diff --git a/.Doxyfile b/.Doxyfile index c3386af2..9dbfe4ba 100644 --- a/.Doxyfile +++ b/.Doxyfile @@ -771,8 +771,11 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ./inference/engine/api ./inference/flow/include/flow.h \ -./common/uni/include/task.h ./inference/flow/src/flow.proto +INPUT = ./inference/engine/api/c \ +./inference/engine/api/java \ +./inference/flow/include/flow.h ./common/uni/include/task.h ./inference/flow/src/flow.proto \ +./training/api/training/api/API.h \ +./training/demos/common/training.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/.gitignore b/.gitignore index 587e6768..016a22bc 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,8 @@ kit/Android/SimpleImageClassification/app/src/main/java kit/iOS/SimpleImgClassfication/libbolt kit/Android/Semantics/app/src/main/java kit/Android/Semantics/app/src/main/assets/ +kit/Android +kit/iOS final_combinations.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d06678f..9ca700b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,9 @@ if (NOT "$ENV{JNI_ROOT}" STREQUAL "") set(USE_JNI ON) endif(JNI_FOUND) endif () +if (USE_SECURE_C) + find_package(SecureC) +endif () if (BUILD_TEST) find_package(jpeg) if (EXISTS ${OpenCV_CMAKE_PATH}) @@ -33,7 +36,7 @@ if (BUILD_TEST) endif (BUILD_TEST) add_subdirectory(common) -if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW) +if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW OR USE_MINDSPORE) add_subdirectory(model_tools) endif() add_subdirectory(compute) @@ -45,6 +48,13 @@ message(STATUS "CXXFLAGS: ${CMAKE_CXX_FLAGS}") add_custom_target(bolt_library ALL COMMAND bash ./scripts/build_light_bolt.sh ${CMAKE_SYSTEM_NAME} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_OUTPUT_EXTENSION} ${CMAKE_SHARED_LIBRARY_PREFIX} ${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_STATIC_LIBRARY_PREFIX} ${CMAKE_STATIC_LIBRARY_SUFFIX} ${CMAKE_BINARY_DIR} WORKING_DIRECTORY ${BOLT_ROOT}) +if (USE_TRAINING) + set(TRAINING_BUILD_C_API ON) + set(TRAINING_BUILD_DEMO ON) + add_subdirectory(training) + add_dependencies(Raul blas_enhance uni) + add_dependencies(Raul blas_enhance_static uni_static) +endif (USE_TRAINING) add_dependencies(bolt_library engine model_spec tensor image blas_enhance uni) add_dependencies(bolt_library engine_static model_spec_static tensor_static image_static blas_enhance_static uni_static) @@ -70,11 +80,30 @@ endif () enable_testing() find_program (BASH_PROGRAM bash) if (BASH_PROGRAM AND USE_GENERAL) + file(GLOB CPUINFO_CMAKE_FILE $ENV{BOLT_ROOT}/common/cmakes/cpuinfo.cmake ${BOLT_ROOT}/common/cmakes/cpuinfo.cmake) + include(${CPUINFO_CMAKE_FILE}) set(parameters --host_dir=${CMAKE_INSTALL_PREFIX}) if (ANDROID) set(parameters ${parameters} -d android --device_dir=/data/local/tmp/uldra) elseif("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_HOST_SYSTEM}" MATCHES "${CMAKE_SYSTEM_NAME}*") - set(parameters ${parameters} -d host) + if ("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "armv7") + set(parameters ${parameters} -d host) + elseif (USE_X86) + set(x86_test ${cpuinfo_avx2}) + if (USE_INT8) + set(x86_test ${cpuinfo_avx512}) + endif () + if (USE_AVX512_VNNI) + set(x86_test ${cpuinfo_avx512_vnni}) + endif() + if (x86_test) + set(parameters 
${parameters} -d host) + else () + set(parameters ${parameters} -d unknown) + endif () + else () + set(parameters ${parameters} -d unknown) + endif() else() set(parameters ${parameters} -d unknown) endif() diff --git a/README.md b/README.md index 9b37e130..995733cd 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ --- [![License: MIT](docs/images/license-mit.png)](https://opensource.org/licenses/MIT) -[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible. +[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning. +Bolt, as a universal deployment tool for all kinds of neural networks, aims to automate the deployment pipeline and achieve extreme acceleration. Bolt has been widely deployed and used in many departments of HUAWEI company, such as 2012 Laboratory, CBG and HUAWEI Product Lines. If you have questions or suggestions, you can submit issue. **QQ群: 833345709** @@ -11,7 +12,7 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709** - **High Performance:** **15%+** faster than existing open source acceleration libraries. - **Rich Model Conversion:** support Caffe, ONNX, TFLite, Tensorflow. - **Various Inference Precision:** support FP32, FP16, INT8, 1-BIT. -- **Multiple platforms:** ARM CPU(v7, v8, v8.2), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512) +- **Multiple platforms:** ARM CPU(v7, v8, v8.2+), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512) - **Bolt is the first to support NLP and also supports common CV applications.** - **Minimize ROM/RAM** - Rich Graph Optimization @@ -23,30 +24,42 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709** # Building Status --- -Kinds of choices are provided for the compilation of bolt. Please make a suitable choice depending on your environment. 
- -| target platform | build command | Linux | Windows | MacOS | -| -------------------- | -------------------------------------------- | ----- | ------- | ----- | -| Android(armv7) | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) | -| Android(armv8+gpu) | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | -| Android(x86_64) | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) | -| iOS(armv7) | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) | -| iOS(armv8) | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | -| Linux(X86_64) | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | -| Linux(x86_64_avx2) | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | -| Windows(X86_64) | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | / | -| Windows(x86_64_avx2) | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | -| MacOS(X86_64) | ./install.sh --target=macos-x86_64 | / | 
/ | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | -| MacOS(x86_64_avx2) | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | - -*NOTE: Bolt defaultly link static library, This may cause some problem on some platforms. You can use --shared option to link shared library.* +The most commonly used inference targets are listed below. More targets can be found in [scripts/target.sh](scripts/target.sh). Please make a suitable choice depending on your environment. +If you want to build the on-device training module, add the **--train** option. +If you want to use multi-thread parallelism, add the **--openmp** option. + +*Bolt links the static library by default, which may cause problems on some platforms. You can use the --shared option to link the shared library instead.* + +| target platform | precision | build command | Linux | Windows | MacOS | +| ---------------------- | ------------------ | ------------------------------------------------------ | ----- | ------- | ----- | +| Android(armv7) | fp32,int8 | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) | +| Android(armv8) | fp32,int8 | ./install.sh --target=android-aarch64 --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=android-aarch64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(gpu) | fp16 | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build 
Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) | +| Android(x86_64) | fp32,int8 | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) | +| iOS(armv7) | fp32,int8 | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) | +| iOS(armv8) | fp32,int8 | ./install.sh --target=ios-aarch64 --fp16=off | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | +| iOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) | +| Linux(armv7) | fp32,int8 | ./install.sh --target=linux-armv7_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(armv8) | fp32,int8 | ./install.sh --target=linux-aarch64_blank --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=linux-aarch64_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(x86_64) | fp32,int8 | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / | +| Linux(x86_64_avx2) | fp32 | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | +| Linux(x86_64_avx512) | fp32,int8 | ./install.sh --target=linux-x86_64_avx512 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / | +| Windows(x86_64) | fp32,int8 | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | 
/ | +| Windows(x86_64_avx2) | fp32 | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | +| Windows(x86_64_avx512) | fp32,int8 | ./install.sh --target=windows-x86_64_avx512 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / | +| MacOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=macos-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | +| MacOS(x86_64) | fp32,int8 | ./install.sh --target=macos-x86_64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) | +| MacOS(x86_64_avx2) | fp32 | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | +| MacOS(x86_64_avx512) | fp32,int8 | ./install.sh --target=macos-x86_64_avx512 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) | # Quick Start ---
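Before walking through the two steps below, build and install bolt for your platform using one of the commands from the table above. The sketch below is illustrative only; it simply combines targets and options (--gpu, --train, --openmp, --shared) that are described in this README, and the exact output layout depends on your toolchain setup.

```bash
# Hypothetical build invocations assembled from the option table above.

# Android armv8.2+ with GPU (Mali/Qualcomm) kernels:
./install.sh --target=android-aarch64 --gpu

# Same target with the on-device training module, OpenMP, and shared libraries:
./install.sh --target=android-aarch64 --train --openmp --shared

# Plain Linux x86_64 host build:
./install.sh --target=linux-x86_64
```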
Two steps to get started with bolt. -1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe,onnx,tflite or tensorflow to .bolt; +1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe, onnx, tflite or tensorflow to .bolt file; 2. Inference: run **[benchmark](inference/examples/benchmark/benchmark.cpp)** with .bolt and data to get the inference result. @@ -56,9 +69,10 @@ Two steps to get started with bolt. Here we show some interesting and useful applications in bolt. -| Face Detection | ASR | Semantics Analysis | Image Classification -| :------: | :------: | :------: |:------: -| [![face_detection](docs/images/20_bolt_face_detection.gif)](inference/examples/ultra_face) demo_link: [face detection](inference/examples/ultra_face) | [![asr](docs/images/ChineseSpeechRecognition.gif)]() demo_link: [asr](inference/examples/automatic_speech_recognition) | [![semantics analysis](docs/images/SemanticsAnalysis.gif)]() demo_link: [semantics analysis](kit/Android/Semantics) | [![image_classification](docs/images/ImageClassification.gif)]() demo_link: [image classification](inference/examples/image_classification) +| Face Detection | ASR | Semantics Analysis | Image Classification | Reading Comprehension | +| :------: | :------: | :------: | :------: | :------: | +| ![face_detection](docs/images/20_bolt_face_detection.gif) [android](kit/Android/FaceDetection) [ios](kit/iOS/FaceDetection) [exe](inference/examples/ultra_face) | ![asr](docs/images/ChineseSpeechRecognition.gif) [android](kit/Android/ChineseSpeechRecognition) [ios](kit/iOS/ChineseSpeechRecognition) | ![semantics analysis](docs/images/SemanticsAnalysis.gif) [android](kit/Android/Semantics) | ![image_classification](docs/images/ImageClassification.gif) [android](kit/Android/SimpleImageClassification) [ios](kit/iOS/SimpleImageClassification) | ![reading_comprehension](docs/images/ReadingComprehension.gif) [android](kit/Android/ReadingComprehension) | + # Verified Networks --- Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md). @@ -81,16 +95,32 @@ Two steps to get started with bolt. BiRealNet, ReActNet, Ghostnet, - SSD, Yolov3, Pointnet, ViT, TNT ... + unet, + LCNet, Pointnet, + hair-segmentation, + duc, + fcn, + retinanet, + SSD, + Faster-RCNN, + Mask-RCNN, + Yolov2, + Yolov3, + Yolov4, + Yolov5, + ViT, TNT ... NLP - Bert, - Albert, Neural Machine Translation, Text To Speech, Automatic Speech Recognition, - Tdnn ... + Bert, Albert, Tinybert, Neural Machine Translation, Text To Speech(Tactron,Tactron2,FastSpeech+hifigan,melgan), Automatic Speech Recognition, DFSMN, Conformer, + Tdnn, + FRILL, + T5, + GPT-2, + Roberta ... @@ -111,18 +141,23 @@ Two steps to get started with bolt. - More models than these mentioned above are supported, users are encouraged to further explore. + More models than these mentioned above are supported, users are encouraged to further explore. + +# On-Device Training +--- +On-Device Training has come, it's a beta vesion which supports [Lenet](./training/demos/lenet_demo/), [Mobilenet_v1](./training/demos/mobilenet_v1_demo) and [Resnet18](./training/demos/resnet18_demo) for training on the embedded devices and servers. Want more details of on-device training in bolt? 
Get started with the official training [tutorial](./training/TUTORIAL.md). # Documentations --- Everything you want to know about bolt is recorded in the detailed documentations stored in [docs](docs). -- [How to install bolt with different compilers](docs/INSTALL.md). -- [How to use bolt to inference your ML models.](docs/USER_HANDBOOK.md) -- [How to develop bolt to customize more models.](docs/DEVELOPER.md) +- [How to install bolt with different compilers?](docs/INSTALL.md) +- [How to use bolt to inference your ML models?](docs/USER_HANDBOOK.md) +- [How to develop bolt to customize more models?](docs/DEVELOPER.md) - [Operators documentation](docs/OPERATORS.md) - [Benchmark results on some universal models.](docs/BENCHMARK.md) -- [How to build demo/example with kit.](docs/KIT.md) +- [How to visualise/protect bolt model?](docs/USER_HANDBOOK.md#model-visualization) +- [How to build demo/example with kit?](docs/KIT.md) - [Frequently Asked Questions(FAQ)](docs/FAQ.md) # Articles --- @@ -133,6 +168,7 @@ Everything you want to know about bolt is recorded in the detailed documentation - [Bolt GPU性能优化,让上帝帮忙掷骰子](https://zhuanlan.zhihu.com/p/336218879) - [Bolt助力HMS机器翻译,自然语言处理又下一城](https://zhuanlan.zhihu.com/p/337887620) - [ARM CPU 1-bit推理,走向极致的道路](https://zhuanlan.zhihu.com/p/158161592) +- [基于深度学习加速库Bolt的声音克隆技术(Voice Cloning)](https://zhuanlan.zhihu.com/p/498919929) # 教程 --- @@ -141,7 +177,8 @@ Everything you want to know about bolt is recorded in the detailed documentation - 情感分类: [Android Demo](https://zhuanlan.zhihu.com/p/414971037) - 中文语音识别: [Android Demo](https://zhuanlan.zhihu.com/p/414978782), [iOS Demo](https://zhuanlan.zhihu.com/p/414981121) - 人脸检测: [Android Demo](https://zhuanlan.zhihu.com/p/414975102), [iOS Demo](https://zhuanlan.zhihu.com/p/414971375) - +- 阅读理解: [Android Demo](https://zhuanlan.zhihu.com/p/498906834) + # Acknowledgement --- Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [tensorflow](https://github.com/tensorflow/tensorflow), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn). 
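The USE_SECURE_C option wired into CMakeLists.txt and common/cmakes/bolt.cmake above is resolved by the FindSecureC.cmake module added later in this patch. A minimal sketch of the environment that module expects is shown below; the install prefix /opt/securec and the direct cmake invocation are illustrative assumptions, only the SecureC_ROOT variable and the include/lib layout come from the module itself.

```bash
# Hypothetical SecureC setup for USE_SECURE_C=ON builds (paths are examples).
export SecureC_ROOT=/opt/securec          # prefix searched via $ENV{SecureC_ROOT}
ls "$SecureC_ROOT/include/securec.h"      # header located by find_path
ls "$SecureC_ROOT"/lib/libsecurec.*       # library located by find_library

# Generic CMake example enabling the option; bolt builds normally go through install.sh.
cmake -DUSE_SECURE_C=ON <path-to-bolt-source>
```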
diff --git a/SUMMARY.md b/SUMMARY.md new file mode 100644 index 00000000..7a4216cd --- /dev/null +++ b/SUMMARY.md @@ -0,0 +1,42 @@ +# Summary + +* [Introduction](README.md) + + +* [Architecture](docs/ARCHITECTURE.md) + + +* [Operators](docs/OPERATORS.md) + + +* [Install](docs/INSTALL.md) + + +* [Basic Inference Usage](docs/USER_HANDBOOK.md#basic-usage) + + +* [Basic On-device Training Usage](training/TUTORIAL.md) + + +* [Advanced Features](docs/USER_HANDBOOK.md#advanced-features) + + +* [Developer Customization](docs/DEVELOPER.md) + + +* [How to Reduce GPU Initial Time](docs/REDUCE_GPU_PREPARE_TIME.md) + + +* [Kit Example](docs/KIT.md) + + +* [Changelog](docs/CHANGELOG.md) + + +* [FAQ](docs/FAQ.md) + + +* [Feedback](docs/FEEDBACK.md) + + +* [Appendix](docs/IOS_USAGE.md) diff --git a/book.json b/book.json new file mode 100644 index 00000000..baf158d1 --- /dev/null +++ b/book.json @@ -0,0 +1,22 @@ +{ + "plugins": [ + "github", + "back-to-top-button", + "page-toc-button", + "insert-logo" + ], + + "pluginsConfig": { + "github": { + "url": "https://github.com/huawei-noah/bolt" + }, + "page-toc-button": { + "maxTocDepth": 1, + "minTocSize": 2 + }, + "insert-logo":{ + "url":"../docs/images/LOGO.PNG", + "style":"background:none;max-height:100px" + } + } +} diff --git a/common/cmakes/FindSecureC.cmake b/common/cmakes/FindSecureC.cmake new file mode 100644 index 00000000..72a8ed82 --- /dev/null +++ b/common/cmakes/FindSecureC.cmake @@ -0,0 +1,24 @@ +find_path(SecureC_INCLUDE_DIR NAMES securec.h HINTS $ENV{SecureC_ROOT}/include ${SecureC_ROOT}/include) + +if (USE_DYNAMIC_LIBRARY) + find_library(SecureC_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) + set(SecureC_SHARED_LIBRARY ${SecureC_LIBRARY}) +else (USE_DYNAMIC_LIBRARY) + find_library(SecureC_LIBRARY NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}securec${CMAKE_STATIC_LIBRARY_SUFFIX} HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) + find_library(SecureC_SHARED_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib) +endif (USE_DYNAMIC_LIBRARY) + +if (SecureC_INCLUDE_DIR AND SecureC_LIBRARY) + set(SecureC_FOUND true) +endif (SecureC_INCLUDE_DIR AND SecureC_LIBRARY) + +if (SecureC_FOUND) + include_directories(${SecureC_INCLUDE_DIR}) + message(STATUS "Found securec.h: ${SecureC_INCLUDE_DIR}") + message(STATUS "Found securec: ${SecureC_LIBRARY}") +else (SecureC_FOUND) + message(FATAL_ERROR " +FATAL: cannot find securec library in /[include|lib] directory, + please set the shell environment variable SecureC_ROOT. 
+ ") +endif (SecureC_FOUND) diff --git a/common/cmakes/bolt.cmake b/common/cmakes/bolt.cmake index 04308d5b..3f0378d1 100644 --- a/common/cmakes/bolt.cmake +++ b/common/cmakes/bolt.cmake @@ -12,6 +12,7 @@ option(USE_CAFFE "set use caffe model as input or not" OFF) option(USE_ONNX "set use onnx model as input or not" OFF) option(USE_TFLITE "set use tflite model as input or not" OFF) option(USE_TENSORFLOW "set use tensorflow model as input or not" OFF) +option(USE_MINDSPORE "set use mindspore model as input or not" OFF) # blas_enhance tensor option(USE_GENERAL "set use CPU serial code or not" OFF) @@ -26,12 +27,23 @@ option(USE_INT8_WINOGRAD "set use ARM NEON INT8 winograd" ON) option(USE_OPENMP "set use openmp to run test(tinybert) or not" OFF) option(USE_LIBRARY_TUNING "set use algorithm tuning or not" OFF) +option(USE_MEM_CHECK "set to use memory check or not" OFF) +option(USE_MODEL_PRINT "set to use model print or not" ON) +option(USE_SECURE_C "set to use Huawei Secure C or not" OFF) + +option(USE_TRAINING "set whether to use training or not" OFF) option(USE_FLOW "set whether to use flow or not" OFF) option(USE_JNI "set whether to use Java API or not" OFF) option(BUILD_TEST "set to build unit test or not" OFF) +include(CheckCXXCompilerFlag) + +if (USE_TRAINING) + set(ANDROID_TOOLCHAIN_PREFIX "aarch64-linux-android-") +endif(USE_TRAINING) + function (set_policy) if (POLICY CMP0074) cmake_policy(SET CMP0074 NEW) @@ -39,15 +51,19 @@ function (set_policy) endfunction(set_policy) macro (set_c_cxx_flags) - set(COMMON_FLAGS "-W -Wextra -O3 -fPIC") - if (NOT WIN32) - set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all") - endif() + set(COMMON_FLAGS "-O3 -fPIC -fPIE") + # warning flag can be remove in release version + set(COMMON_FLAGS "${COMMON_FLAGS} -W -Wextra") + set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all -fstack-protector-strong") set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-command-line-argument -Wno-unused-parameter") set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-variable") if (USE_OPENMP) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_OPENMP -fopenmp") + CHECK_CXX_COMPILER_FLAG("-static-openmp" COMPILER_SUPPORTS_STATIC_OPENMP) + if (COMPILER_SUPPORTS_STATIC_OPENMP) + set(COMMON_FLAGS "${COMMON_FLAGS} -static-openmp") + endif () endif(USE_OPENMP) if (USE_THREAD_SAFE OR USE_CAFFE OR USE_ONNX OR USE_FLOW) @@ -99,27 +115,29 @@ macro (set_c_cxx_flags) if (USE_INT8) set(COMMON_FLAGS "${COMMON_FLAGS} -mavx512f") endif (USE_INT8) - if (USE_AVX512_VNNI) + if (USE_AVX512_VNNI) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_AVX512_VNNI") - endif(USE_AVX512_VNNI) + endif(USE_AVX512_VNNI) endif(USE_X86) if (USE_FP32) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32") endif (USE_FP32) + if (USE_FP16) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") + if (USE_F16_MIX_PRECISION) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") + endif (USE_F16_MIX_PRECISION) + endif () + if (USE_INT8) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8") endif (USE_INT8) if (USE_NEON) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON") - if (USE_FP16) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16") - if (USE_F16_MIX_PRECISION) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION") - endif (USE_F16_MIX_PRECISION) if (USE_INT8) if (USE_INT8_WINOGRAD) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8_WINOGRAD") @@ -137,9 +155,6 @@ macro (set_c_cxx_flags) endif () endif (USE_INT8) endif (USE_FP16) - if (USE_INT8) - set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8") - 
endif () endif(USE_NEON) if (USE_CAFFE) @@ -154,6 +169,21 @@ macro (set_c_cxx_flags) if (USE_TENSORFLOW) set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TENSORFLOW") endif() + if (USE_MINDSPORE) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MINDSPORE") + endif() + + if (USE_MEM_CHECK) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MEM_CHECK") + endif() + + if (USE_MODEL_PRINT) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MODEL_PRINT") + endif() + + if (USE_SECURE_C) + set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_SECURE_C") + endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS} -std=gnu99") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} -std=c++11") @@ -168,8 +198,14 @@ endmacro(set_c_cxx_flags) macro (set_test_c_cxx_flags) if (NOT USE_DYNAMIC_LIBRARY) set(COMMON_FLAGS "${COMMON_FLAGS} -static-libstdc++") - if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(COMMON_FLAGS "${COMMON_FLAGS} -static") + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(COMMON_FLAGS "${COMMON_FLAGS} -static-libgcc") + if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}") + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif() + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows") + set(COMMON_FLAGS "${COMMON_FLAGS} -static") + endif() endif() endif() @@ -198,6 +234,7 @@ if(USE_DYNAMIC_LIBRARY) set(model_tools_onnx_library model_tools_onnx) set(model_tools_tflite_library model_tools_tflite) set(model_tools_tensorflow_library model_tools_tensorflow) + set(model_tools_mindspore_library model_tools_mindspore) set(model_tools_library model_tools) set(engine_library engine) set(flow_library flow) @@ -213,6 +250,7 @@ else() set(model_tools_onnx_library model_tools_onnx_static) set(model_tools_tflite_library model_tools_tflite_static) set(model_tools_tensorflow_library model_tools_tensorflow_static) + set(model_tools_mindspore_library model_tools_mindspore_static) set(model_tools_library model_tools_static) set(engine_library engine_static) set(flow_library flow_static) @@ -220,10 +258,16 @@ endif() macro(include_uni) include_directories(${BOLT_ROOT}/common/uni/include) + if (USE_SECURE_C) + include_directories(${SecureC_ROOT}/include) + endif () endmacro() macro(link_uni name) target_link_libraries(${name} ${uni_library}) + if (USE_SECURE_C) + target_link_libraries(${name} ${SecureC_LIBRARY}) + endif () endmacro() macro(include_model_spec) @@ -330,7 +374,10 @@ macro(link_model_tools name) target_link_libraries(${name} ${model_tools_tensorflow_library}) target_link_libraries(${name} ${JSONCPP_LIBRARY}) endif() - if(USE_CAFFE OR USE_ONNX) + if(USE_MINDSPORE) + target_link_libraries(${name} ${model_tools_mindspore_library}) + endif() + if(USE_CAFFE OR USE_ONNX OR USE_MINDSPORE) link_protobuf(${name}) endif() link_model_spec(${name}) diff --git a/common/cmakes/cpuinfo.cmake b/common/cmakes/cpuinfo.cmake new file mode 100644 index 00000000..87c94f77 --- /dev/null +++ b/common/cmakes/cpuinfo.cmake @@ -0,0 +1,16 @@ +set(CPUINFO "null") +file(GLOB CPUINFO_FILE /proc/cpuinfo) +if (CPUINFO_FILE) + exec_program(cat ARGS ${CPUINFO_FILE} OUTPUT_VARIABLE CPUINFO) +else () + message(STATUS "can not find /proc/cpuinfo") +endif () + +macro(check_cpuinfo feature) + string(REGEX REPLACE "^.*(${feature}).*$" "\\1" _FEATURE_THERE ${CPUINFO}) + string(COMPARE EQUAL "${feature}" "${_FEATURE_THERE}" cpuinfo_${feature}) +endmacro() + +check_cpuinfo(avx2) +check_cpuinfo(avx512) +check_cpuinfo(avx512_vnni) diff --git a/common/gcl/include/gcl_common.h 
b/common/gcl/include/gcl_common.h index 0e0e16c2..2836d69c 100644 --- a/common/gcl/include/gcl_common.h +++ b/common/gcl/include/gcl_common.h @@ -130,13 +130,14 @@ inline CI8 *map_cl_error_2_string(cl_int err) } } -#define map_cl_error_2_ee(err) \ - { \ - if (err == 0) \ - return SUCCESS; \ - UNI_ERROR_LOG("GCLAPI error in: File: %s Line: %d Func name is: %s GCLERROR = %s\n", \ - __FILE__, __LINE__, __FUNCTION__, map_cl_error_2_string(err)); \ - return GCL_ERROR; \ +#define map_cl_error_2_ee(err) \ + { \ + if (err == 0) { \ + return SUCCESS; \ + } else { \ + UNI_ERROR_LOG("GCLAPI error: %s.\n", map_cl_error_2_string(err)); \ + return GCL_ERROR; \ + } \ } inline EE has_dedicated_local(Device device, I32 *b) @@ -171,6 +172,14 @@ struct GCLKernelInfo { std::string name; }; +typedef struct { + I32 algorithm; + U32 best_h[6]; + U32 best_c[6]; + U32 best_k[6]; +} ForwardRunInfoMali; +typedef ForwardRunInfoMali *ForwardRunInfoMali_t; + struct GCLHandle { Platform *platforms; U32 numPlatform; @@ -201,6 +210,8 @@ struct GCLHandle { std::string deviceName; std::map kernelMap; std::map programMap; + std::map, ForwardRunInfoMali> runInfoCache; + std::map> kernelLSCache; std::vector *kernelVec; std::string curOpName; void *kernel_source; @@ -221,14 +232,6 @@ struct GCLHandleConfig { typedef GCLHandleConfig *GCLHandleConfig_t; -typedef struct { - I32 algorithm; - U32 best_h[6]; - U32 best_c[6]; - U32 best_k[6]; -} ForwardRunInfoMali; -typedef ForwardRunInfoMali *ForwardRunInfoMali_t; - typedef struct { GCLHandle_t handle; GCLMemDesc_t gclmemFilterDesc; diff --git a/common/gcl/include/gcl_func.h b/common/gcl/include/gcl_func.h index 515c1486..93cb7130 100644 --- a/common/gcl/include/gcl_func.h +++ b/common/gcl/include/gcl_func.h @@ -559,7 +559,8 @@ inline EE gcl_create_kernel_with_source_map( option = handle->common_source_opt + " " + option; } if (!kernel_source->get_source(sourceName, &source_ptr)) { - UNI_ERROR_LOG("the %s doesn't exist in sourceMap\n", sourceName); + UNI_ERROR_LOG( + "the %s doesn't exist in sourceMap to find kernel %s.\n", sourceName, kernelName); CHECK_STATUS(NULL_POINTER); } @@ -878,6 +879,53 @@ inline EE gcl_run_kernel( return SUCCESS; } +inline EE gcl_get_kernel_name(Kernel kernel, I8 *kernelName) +{ + char name[256]; + U32 len; + CHECK_STATUS(get_kernel_name(kernel, name, &len)); + if (len > 256) { + UNI_ERROR_LOG("KernelName length %d > 256, please reset name array length\n", len); + CHECK_STATUS(NOT_MATCH); + } else { + UNI_STRCPY(kernelName, name); + } + return SUCCESS; +} + +inline void gcl_set_kernel_ls_to_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3]) +{ + std::string name = kernelName; + name += "_" + std::to_string(gs[0]); + name += "_" + std::to_string(gs[1]); + name += "_" + std::to_string(gs[2]); + std::vector lsVec = {ls[0], ls[1], ls[2]}; + if (handle->kernelLSCache.find(name) == handle->kernelLSCache.end()) { + handle->kernelLSCache[name] = lsVec; + } +} + +inline bool gcl_get_kernel_ls_from_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3]) +{ + std::string name = kernelName; + name += "_" + std::to_string(gs[0]); + name += "_" + std::to_string(gs[1]); + name += "_" + std::to_string(gs[2]); + if (handle->kernelLSCache.find(name) != handle->kernelLSCache.end()) { + for (U32 i = 0; i < 3; i++) { + ls[i] = handle->kernelLSCache[name][i]; + } + UNI_DEBUG_LOG("get kernel %s ls from cache success, gs is {%d %d %d}, ls is {%d %d %d}\n", + kernelName, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2]); + return true; + } else { + UNI_DEBUG_LOG("get 
kernel %s ls from cache fail, try to find best ls for kernel, gs is {%d " + "%d %d}\n", + kernelName, gs[0], gs[1], gs[2]); + return false; + } +} + inline U32 get_next_ls_size(U32 ls_size) { return (ls_size << 1); @@ -969,16 +1017,20 @@ inline EE gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector kerne for (auto index : kernelIndex) { auto kernelInfo = (*handle->kernelVec)[index]; bool needSelectLs = false; + U32 gs[3] = {0, 0, 0}; for (U32 i = 0; i < kernelInfo.dim; i++) { if (kernelInfo.ls[i] == 0) { needSelectLs = true; - break; } + gs[i] = kernelInfo.gs[i]; } if (!needSelectLs) { continue; } CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo)); + char kernelName[256]; + gcl_get_kernel_name(kernelInfo.kernel, kernelName); + gcl_set_kernel_ls_to_cache(handle, kernelName, gs, kernelInfo.ls); (*handle->kernelVec)[index].gs[0] = kernelInfo.gs[0]; (*handle->kernelVec)[index].gs[1] = kernelInfo.gs[1]; (*handle->kernelVec)[index].gs[2] = kernelInfo.gs[2]; @@ -995,17 +1047,18 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( { std::vector kernelIndex; U32 len = handle->kernelVec->size(); + bool needSaveKernelThreadInfoToMap = false; for (U32 i = 0; i < len; i++) { auto kernelInfo = (*handle->kernelVec)[i]; - U32 gs[3]; - U32 ls[3]; + U32 gs[3] = {0}; + U32 ls[3] = {0}; bool findKernelThreadInfo = false; findKernelThreadInfo = algoMap->getKernelThreadInfoFromMap(kernelInfo.name, gs, ls); U32 dim = (*handle->kernelVec)[i].dim; if (findKernelThreadInfo) { U32 cur_gs[3]; for (U32 j = 0; j < dim; j++) { - cur_gs[j] = (*handle->kernelVec)[i].gs[j]; + cur_gs[j] = kernelInfo.gs[j]; if (ls[j] != 0) { cur_gs[j] = (cur_gs[j] + ls[j] - 1) / ls[j] * ls[j]; } @@ -1014,16 +1067,29 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( } } else { bool noNeedInferLS = true; + needSaveKernelThreadInfoToMap = true; for (U32 j = 0; j < dim; j++) { - gs[j] = (*handle->kernelVec)[i].gs[j]; - ls[j] = (*handle->kernelVec)[i].ls[j]; + gs[j] = kernelInfo.gs[j]; + ls[j] = kernelInfo.ls[j]; if (ls[j] == 0) { noNeedInferLS = false; } } + if (!noNeedInferLS) { + char kernelName[256]; + gcl_get_kernel_name(kernelInfo.kernel, kernelName); + if (gcl_get_kernel_ls_from_cache(handle, kernelName, gs, ls)) { + for (U32 j = 0; j < dim; j++) { + (*handle->kernelVec)[i].ls[j] = ls[j]; + } + noNeedInferLS = true; + } + } if (noNeedInferLS) { for (U32 j = 0; j < dim; j++) { - (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j]; + if (ls[j] > 0) { + (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j]; + } } } if (!noNeedInferLS) { @@ -1032,9 +1098,11 @@ inline EE gcl_infer_best_kernelVec_ls_with_map( } } CHECK_STATUS(gcl_run_kernelVec_select_ls(handle, kernelIndex)); - for (U32 i = 0; i < len; i++) { - auto kernelInfo = (*handle->kernelVec)[i]; - algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + if (needSaveKernelThreadInfoToMap) { + for (U32 i = 0; i < len; i++) { + auto kernelInfo = (*handle->kernelVec)[i]; + algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls); + } } return SUCCESS; } @@ -1387,7 +1455,7 @@ inline EE gcl_set_kernelArgs(Kernel kernel, Args... 
args) inline std::string gclMemDesc2Str(GCLMemDesc desc) { char buff[128]; - snprintf(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt], + UNI_SNPRINTF(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt], DataFormatName()[desc.memFormat]); std::string descStr = buff; descStr += "stride("; @@ -1414,6 +1482,28 @@ inline EE gcl_get_image_size(GCLMem_t gclMem, U32 *width, U32 *height, U32 *dept CHECK_STATUS(get_image_size(gclMem->mem, width, height, depth)); return SUCCESS; } + +inline void gcl_set_runInfo_to_cache( + GCLHandle_t handle, std::vector flag, ForwardRunInfoMali runInfo) +{ + if (handle->runInfoCache.find(flag) == handle->runInfoCache.end()) { + handle->runInfoCache[flag] = runInfo; + } +} + +inline bool gcl_get_runInfo_from_cache( + GCLHandle_t handle, std::vector flag, ForwardRunInfoMali_t runInfo) +{ + if (handle->runInfoCache.find(flag) != handle->runInfoCache.end()) { + *runInfo = handle->runInfoCache[flag]; + UNI_DEBUG_LOG("get forward run info from cache success\n"); + return true; + } else { + UNI_DEBUG_LOG("get forward run info from cache fail, try to find best forward run info\n"); + return false; + } +} + #ifdef _DEBUG template inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8 *gclMemName = NULL) diff --git a/common/gcl/include/kernel.h b/common/gcl/include/kernel.h index 5653e1b5..d2147edc 100644 --- a/common/gcl/include/kernel.h +++ b/common/gcl/include/kernel.h @@ -49,6 +49,22 @@ inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void **value, size map_cl_error_2_ee(ret); } +inline EE get_kernel_name(Kernel kernel, char* name, U32 *len) +{ + if (NULL == name || NULL == len) { + return NULL_POINTER; + } + + size_t lenVal; + cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, 0, NULL, &lenVal); + if (ret != CL_SUCCESS) { + map_cl_error_2_ee(ret); + } + *len = lenVal; + ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, lenVal, name, NULL); + map_cl_error_2_ee(ret); +} + inline EE get_program_info_from_kernel(Kernel kernel, Program *program) { cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(Program), program, NULL); diff --git a/common/gcl/src/ocl_data_trans.cpp b/common/gcl/src/ocl_data_trans.cpp index ab0aa94d..0b16d266 100644 --- a/common/gcl/src/ocl_data_trans.cpp +++ b/common/gcl/src/ocl_data_trans.cpp @@ -413,7 +413,7 @@ EE ocl_trans_mem( CHECK_STATUS(NOT_MATCH); } CHECK_STATUS(set_padding_opt_mali( - true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, sw_str, sh_str, dw_str, dh_str, 0, 0, sw_str, sh_str, dw_str, dh_str, pl, pr, pt, pb, gs[0], gs[1], srcMem, dstMem)); @@ -494,7 +494,7 @@ EE ocl_map_mem_write( CHECK_STATUS(NOT_MATCH); } CHECK_STATUS(set_padding_opt_mali( - true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, w, h, w_str, h_str, offset, 0, w, h, w_str, h_str, pl, pr, pt, pb, gs[0], gs[1], gclMem->mem, gclMem->mem)); diff --git a/common/gcl/tools/gcl_sample/sample.cpp b/common/gcl/tools/gcl_sample/sample.cpp index b496ad07..8d7821a9 100644 --- a/common/gcl/tools/gcl_sample/sample.cpp +++ 
b/common/gcl/tools/gcl_sample/sample.cpp @@ -10,7 +10,6 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#ifdef _USE_FP16 #include "gcl.h" #include "ocl_context.h" @@ -128,22 +127,22 @@ int main() oc_str = oc_str / (ot * on); on_str = owh_str * oc_str; - // F16* input_val = (F16*)malloc(inputGclDesc.byteSize); - // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize); - // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize); - // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1; - // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1; - // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0; - // U32 size[3] = {1, 1, 1}; - // size[0] = inputGclDesc.byteSize; - // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE)); - // size[0] = filterGclDesc.byteSize; - // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE)); - // size[0] = biasGclDesc.num; - // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE)); + // F16* input_val = (F16*)malloc(inputGclDesc.byteSize); + // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize); + // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize); + // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1; + // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1; + // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0; + // U32 size[3] = {1, 1, 1}; + // size[0] = inputGclDesc.byteSize; + // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE)); + // size[0] = filterGclDesc.byteSize; + // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE)); + // size[0] = biasGclDesc.num; + // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE)); // - // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input")); - // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter")); + // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input")); + // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter")); gcl_finish(handle); for (U32 item_bn = 2; item_bn <= 4; item_bn++) { for (U32 item_kn = 1; item_kn <= 2; item_kn = item_kn * 2) { @@ -160,10 +159,10 @@ int main() } Kernel kernel; - char kernelName[1024]; - sprintf(kernelName, "conv_direct_multi_batch_s1_%d%d%d%d%d", fw, fh, item_w, - item_kn, item_bn); - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + std::string kernelName = std::string("conv_direct_multi_batch_s1_") + + std::to_string(fw) + std::to_string(fh) + std::to_string(item_w) + + std::to_string(item_kn) + std::to_string(item_bn); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); if (oc_str % item_kn != 0) { continue; } @@ -174,7 +173,7 @@ int main() CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iwh_str, ic_str, ih_off, iw_off, oh_str, owh_str, oh_off, ow_off, ow, oc, on, sh, in_str, on_str, gs[0], gs[1], input->mem, flt->mem, bias->mem, output->mem)); - gcl_set_kernelVec(handle, 
kernel, dim, gs, ls, kernelName); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str()); CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelVec[0])); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size())); @@ -185,12 +184,11 @@ int main() #else CHECK_STATUS(gcl_run_kernelVec(handle)); #endif - // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output")); - // CHECK_STATUS(gcl_fill_memory_zero(handle, output)); + // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output")); + // CHECK_STATUS(gcl_fill_memory_zero(handle, output)); CHECK_STATUS(gcl_clean_kernelVec(handle)); gcl_finish(handle); } } } } -#endif diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp index c6f2e89d..469a238f 100644 --- a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp @@ -164,9 +164,9 @@ int main(I32 argc, I8 *argv[]) U32 srcLen = imageLen + half16Len + clcodeLen; I8 *source = new I8[srcLen]; #ifdef CL_VERSION_1_2 - memcpy(source, imagesource, imageLen); + UNI_MEMCPY(source, imagesource, imageLen); #endif - memcpy(source + imageLen, half16source, half16Len); + UNI_MEMCPY(source + imageLen, half16source, half16Len); FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen); if (!FileStatus) { printf("load bin file failed\n"); diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp index 3b73ffba..cfd4c113 100644 --- a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp +++ b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp @@ -12,11 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include -#include #include #include #include -#include int main(int argc, char *argv[]) { @@ -55,57 +53,47 @@ } binMapName = argv[3]; } else { - printf("please input .bin name + binmapname or input .bin name + .cpp name + binmapname\n"); + printf("[ERROR] please pass xxx.bin name + binmapname or xxx.bin name + xxx.cpp name + " + "binmapname.\n"); + return 1; } FILE *fpbin = fopen(binFile.c_str(), "rb"); if (fpbin == NULL) { - printf("file %s open error\n", binFile.c_str()); + printf("[ERROR] can not open file %s.\n", binFile.c_str()); return 1; } struct stat f_stat; if (stat(binFile.c_str(), &f_stat) == -1) { - printf("file %s get size error\n", binFile.c_str()); + printf("[ERROR] can not get file %s size.\n", binFile.c_str()); fclose(fpbin); return 1; } int filelen = f_stat.st_size; - std::stringstream templen; - templen << filelen; - std::string filelen_st = templen.str(); - std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + - std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) + - "[] = {"; - - unsigned char charRead; - std::string appendBuf; - + std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " + + std::string(charName) + "_len = " + std::to_string(filelen) + ";\nCU8 " + + std::string(charName) + "[] = {"; + std::stringstream ss; for (int i = 0; i < filelen; i++) { - appendBuf.clear(); + unsigned char c; if (i % 20 == 0) { - appendBuf += "\n"; + ss << "\n"; } - if (1 != fread(&charRead, 1, 1, fpbin)) { - printf("file %s read error\n", binFile.c_str()); + if (1 != fread(&c, 1, 1, fpbin)) { + printf("[ERROR] can not read file %s content.\n", binFile.c_str()); fclose(fpbin); return 1; } - char tempstr[4]; - sprintf(tempstr, "0x%02x", charRead); - appendBuf += std::string(tempstr); - + ss << "0x" << std::hex << std::setw(2) << std::setfill('0') << (int)c; if (i == filelen - 1) { } else if (i % 20 == 19) { - appendBuf += ","; + ss << ","; } else { - appendBuf += ", "; + ss << ", "; } - str += appendBuf; } - - str += "};"; + str += ss.str() + "};"; std::ofstream file; file.open(cppFile.c_str()); @@ -113,6 +101,5 @@ file.close(); fclose(fpbin); - return 0; } diff --git a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp index 2dc1cd0e..2a0d5871 100644 --- a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp +++ b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp @@ -378,11 +378,7 @@ int main() if (boltEnv == NULL) { UNI_ERROR_LOG("BOLT_ROOT env value has not been set successfully\n"); }; - std::string boltPath = boltEnv; - CI8 lastFlag = boltPath[boltPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - boltPath += "/"; - } + std::string boltPath = boltEnv + std::string("/"); std::string tensorComputingClPath = "compute/tensor/src/gpu/mali/cl/"; std::string tensorComputingClPathQc = "compute/tensor/src/gpu/mali/cl/qualcomm/"; std::string imageClPath = "compute/image/src/gpu/mali/cl/"; diff --git a/common/memory/include/memory_cpu.hpp b/common/memory/include/memory_cpu.hpp index 7ac82f70..eb654d49 100644 --- a/common/memory/include/memory_cpu.hpp +++ b/common/memory/include/memory_cpu.hpp @@ -18,7 +18,7 @@ inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes) { - void *ptr = (void **)operator new(bytes + sizeof(void *) + alignment - 1); + void *ptr = (void **)UNI_OPERATOR_NEW(bytes + sizeof(void *) + alignment - 1); CHECK_REQUIREMENT(ptr != NULL); void **aligned_ptr = (void 
**)(((uintptr_t)(ptr) + sizeof(void *) + alignment - 1) & ~(alignment - 1)); @@ -28,7 +28,7 @@ inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes) inline void CPUMemoryAlignedfree(void *aligned_ptr) { - operator delete(((void **)aligned_ptr)[-1]); + UNI_OPERATOR_DELETE(((void **)aligned_ptr)[-1]); } class CpuMemory : public Memory { @@ -39,7 +39,8 @@ class CpuMemory : public Memory { this->allocated = false; } - ~CpuMemory() = default; + ~CpuMemory() + {} std::shared_ptr clone(bool allocate) override { @@ -71,13 +72,13 @@ class CpuMemory : public Memory { this->capacitySize = size; try { #ifndef _USE_X86 - this->val = std::shared_ptr((U8 *)operator new(size)); + this->val = std::shared_ptr((U8 *)UNI_OPERATOR_NEW(size), UNI_OPERATOR_DELETE); #else this->val = std::shared_ptr( (U8 *)CPUMemoryAlignedAlloc(64, size), CPUMemoryAlignedfree); #endif } catch (const std::bad_alloc &e) { - UNI_ERROR_LOG("CPU memory alloc %d bytes failed\n", (int)size); + UNI_ERROR_LOG("CPU memory alloc %d bytes failed.\n", (int)size); } } this->allocated = true; @@ -179,7 +180,7 @@ class CpuMemory : public Memory { std::string string(U32 num, F32 factor) override { U32 capacityNum = this->capacitySize / bytesOf(this->desc.dt); - std::string line = "desc: " + tensorDesc2Str(this->desc) + " data:"; + std::string line = "desc:" + tensorDesc2Str(this->desc) + " data:"; for (U32 i = 0; i < num && i < capacityNum; i++) { line = line + std::to_string(this->element(i) / factor) + " "; } @@ -187,7 +188,7 @@ class CpuMemory : public Memory { for (U32 i = 0; i < UNI_MIN(tensorNumElements(this->desc), capacityNum); i++) { sum += this->element(i) / factor; } - line += " sum: " + std::to_string(sum); + line += " sum:" + std::to_string(sum); return line; } diff --git a/common/memory/include/memory_ocl.hpp b/common/memory/include/memory_ocl.hpp index a7194cce..488880f1 100644 --- a/common/memory/include/memory_ocl.hpp +++ b/common/memory/include/memory_ocl.hpp @@ -23,7 +23,7 @@ class OclMemory : public Memory { public: OclMemory() { - memset(&(this->desc), 0, sizeof(GCLMemDesc)); + UNI_MEMSET(&(this->desc), 0, sizeof(GCLMemDesc)); this->desc.memFormat = DF_NCHW; this->desc.memType = GCL_MEM_BUF; this->desc.flags = CL_MEM_READ_WRITE; @@ -202,14 +202,14 @@ class OclMemory : public Memory { if (!allocated) { U8 *tmp = nullptr; if (size < this->desc.byteSize) { - U8 *tmp = (U8 *)operator new(this->desc.byteSize); - memset(tmp, 0, this->desc.byteSize); - memcpy(tmp, host_ptr, size); + U8 *tmp = (U8 *)UNI_OPERATOR_NEW(this->desc.byteSize); + UNI_MEMSET(tmp, 0, this->desc.byteSize); + UNI_MEMCPY(tmp, host_ptr, size); host_ptr = tmp; } this->alloc(host_ptr); - if (tmp) { - delete tmp; + if (tmp != nullptr) { + UNI_OPERATOR_DELETE(tmp); } } else { this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM @@ -345,7 +345,7 @@ class OclMemory : public Memory { std::string string(U32 num, F32 factor) override { - std::string line = "desc: " + gclMemDesc2Str(this->desc) + " data: "; + std::string line = "desc:" + gclMemDesc2Str(this->desc) + " data:"; #ifdef _DEBUG DataType dt = (this->desc.dt == DT_U8) ? 
DT_F16 : this->desc.dt; if (dt == DT_U32) { @@ -374,7 +374,7 @@ class OclMemory : public Memory { for (U32 i = 0; i < this->length(); i++) { sum += this->element(i) / factor; } - line += " sum: " + std::to_string(sum); + line += " sum:" + std::to_string(sum); } #endif return line; diff --git a/common/memory/include/memory_ocl_img.hpp b/common/memory/include/memory_ocl_img.hpp index 6865aa43..abd6a7f1 100644 --- a/common/memory/include/memory_ocl_img.hpp +++ b/common/memory/include/memory_ocl_img.hpp @@ -127,9 +127,9 @@ class OclMemoryImg : public OclMemory { U8 *tmp = nullptr; if (size < this->desc.byteSize) { if (this->get_mem_type() == OCLMemImg1D) { - U8 *tmp = (U8 *)operator new(this->bytes()); - memset(tmp, 0, this->bytes()); - memcpy(tmp, host_ptr, size); + tmp = (U8 *)UNI_OPERATOR_NEW(this->bytes()); + UNI_MEMSET(tmp, 0, this->bytes()); + UNI_MEMCPY(tmp, host_ptr, size); host_ptr = tmp; } else { CHECK_STATUS(NOT_MATCH); @@ -146,6 +146,9 @@ class OclMemoryImg : public OclMemory { CHECK_STATUS(NOT_SUPPORTED); } } + if (tmp != nullptr) { + UNI_OPERATOR_DELETE(tmp); + } } else { if (!allocated) { this->alloc(); diff --git a/common/memory/include/tensor.hpp b/common/memory/include/tensor.hpp index abd6d20f..37966c27 100644 --- a/common/memory/include/tensor.hpp +++ b/common/memory/include/tensor.hpp @@ -85,6 +85,11 @@ class Tensor { *(this->scale) = scale; } + void set_scale_ptr(std::shared_ptr scale) + { + this->scale = scale; + } + F32 get_scale() { return *(this->scale); @@ -97,7 +102,7 @@ class Tensor { void copy_from(Tensor *other) { - memcpy(this->scale.get(), other->scale.get(), sizeof(F32)); + UNI_MEMCPY(this->scale.get(), other->scale.get(), sizeof(F32)); this->val->copy_from(other->val.get()); } diff --git a/common/memory/include/tensor_common.h b/common/memory/include/tensor_common.h index b0672299..ab912567 100644 --- a/common/memory/include/tensor_common.h +++ b/common/memory/include/tensor_common.h @@ -40,7 +40,7 @@ static void transformToNCHWKernel( case DF_NCHW: { if (in == on && ic == oc && ih == oh && iw == ow) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt); @@ -49,7 +49,7 @@ static void transformToNCHWKernel( for (U32 h = 0; h < oh && h < ih; h++) { U32 srcIndex = ((n * ic + c) * ih + h) * iw; U32 dstIndex = ((n * oc + c) * oh + h) * ow; - memcpy(output + dstIndex, input + srcIndex, tileSize); + UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize); } } } @@ -169,7 +169,7 @@ static void transformToNHWCKernel( case DF_NHWC: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } break; } @@ -262,9 +262,9 @@ EE transformNCHWToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -299,9 +299,9 @@ EE transformNHWCToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - 
memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -318,7 +318,7 @@ EE transformNCHWC8ToNCHWC8ByGroup( U32 outputSize = tensorNumElements(outputDesc); if (group <= 1 || inputSize == outputSize) { if (input != output) { - memcpy(output, input, outputSize); + UNI_MEMCPY(output, input, outputSize); } return SUCCESS; } @@ -354,10 +354,10 @@ EE transformNCHWC8ToNCHWC8ByGroup( U32 srcIndex = ((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) * elementSize; - memcpy( + UNI_MEMCPY( (U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize); } else { - memset((U8 *)output + dstIndex, 0, elementSize); + UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize); } } } @@ -417,7 +417,7 @@ EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDes for (U32 hw = 0; hw < ih * iw; hw++) { U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); } } break; @@ -475,7 +475,7 @@ EE array_transpose(DataType dt, inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; } inputIndex += inputLocalIndex[sizeInnerIndex]; - memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); } return SUCCESS; @@ -513,7 +513,7 @@ EE array_transpose_naive(DataType dt, inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; } inputIndex += inputLocalIndex[0]; - memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); + UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize); } return SUCCESS; diff --git a/common/memory/include/tensor_desc.h b/common/memory/include/tensor_desc.h index f3b92a6e..d2353a02 100644 --- a/common/memory/include/tensor_desc.h +++ b/common/memory/include/tensor_desc.h @@ -20,11 +20,14 @@ #include "data_type.h" #include "error.h" +#include "secure_c_wrapper.h" #ifdef _USE_GPU #define CL_TARGET_OPENCL_VERSION 200 #include "CL/cl.h" #endif +#define DIM_LEN 6 + typedef enum { DF_NCHW, DF_NCHWN16, // vectorize for N=16, for filter @@ -68,7 +71,8 @@ typedef enum { DF_NKN12K4, // Optimized MMM filter for INT8 DF_NKNx_NKN32, // Optimized LSTM filter DF_NCHWC16, // vectorize for C=16, for input and output - DF_NCHWC2NxC4 + DF_NCHWC2NxC4, + DF_SCALAR } DataFormat; inline const char *const *DataFormatName() @@ -79,7 +83,8 @@ inline const char *const *DataFormatName() "DF_MKT", "DF_NK", "DF_NKN16", "DF_NKN32", "DF_NKN64", "DF_NKN32K4", "DF_NCHWC4", "DF_NCHWC3", "DF_NHWC", "DF_NCHWN4C4", "DF_NCHWN4", "DF_HWCN", "DF_NCWHN4C4", "DF_NHWCN4", "DF_CHWNC4", "DF_CHWNC8", "DF_CHWNC16", "DF_CHWC8_NCN8", "DF_RGB", "DF_HWNCN8", "DF_NKN24", - "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4"}; + "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4", + "DF_SCALAR"}; return names; } @@ -87,13 +92,13 @@ typedef struct TensorDesc { DataType dt = DT_U8; DataFormat df = DF_NCHW; U32 nDims = 0; - U32 dims[6] = {0}; + U32 dims[DIM_LEN] = {0}; } TensorDesc; inline TensorDesc tensor0d() { TensorDesc desc; - memset(&desc, 0, sizeof(TensorDesc)); + UNI_MEMSET(&desc, 0, sizeof(TensorDesc)); return desc; } @@ -365,20 +370,38 @@ inline U8 tensorIs5d(TensorDesc desc) return 5 == desc.nDims; } +// in order to support shape 
calculation, there is a reserved buffer in TensorDesc.dims to save. +inline U8 tensorIsShape(TensorDesc desc) +{ + U32 length = tensorNumElements(desc); + U8 ret = 0; + if (desc.dt == DT_U32 && length > 0 && length + desc.nDims <= DIM_LEN) { + ret = 1; + } + return ret; +} + inline std::string tensorDesc2Str(TensorDesc desc) { std::string descStr = "dt:" + std::string(DataTypeName()[desc.dt]) + " df:" + std::string(DataFormatName()[desc.df]) + " dims:" + std::to_string(desc.nDims); - if (desc.nDims > 0) { descStr += "("; - } - for (I32 i = int(desc.nDims) - 1; i >= 0; i--) { - descStr += std::to_string(desc.dims[i]); - if (i > 0) { - descStr += ","; - } else { - descStr += ")"; + for (I32 i = int(desc.nDims) - 1; i > 0; i--) { + descStr += std::to_string(desc.dims[i]) + ","; + } + descStr += std::to_string(desc.dims[0]) + ")"; + if (tensorIsShape(desc)) { + U32 length = tensorNumElements(desc); + descStr += " reserve:("; + for (U32 i = desc.nDims; i < desc.nDims + length && i < DIM_LEN; i++) { + descStr += std::to_string((int)desc.dims[i]); + if (i + 1 < desc.nDims + length && i + 1 < DIM_LEN) { + descStr += ","; + } else { + descStr += ")"; + } + } } } @@ -387,15 +410,15 @@ inline std::string tensorDesc2Str(TensorDesc desc) inline int tensorDescIsValid(TensorDesc desc) { - if (desc.dt < 0 || desc.dt >= 10) { + if (desc.dt < 0 || desc.dt >= DT_NUM) { return 0; } - if (desc.df < 0 || desc.df >= 30) { + if (desc.df < 0 || desc.df >= 50) { return 0; } - if (desc.nDims > 6) { + if (desc.nDims > DIM_LEN) { return 0; } @@ -427,6 +450,7 @@ inline DataFormat getTensorDefaultDataFormat(int nDims) return df; } +// return format is [w, h, c, n] inline std::vector calculateLocalIndex(U32 index, const U32 *dims, U32 nDims) { std::vector indexes(nDims); @@ -441,7 +465,8 @@ inline U32 calculateGlobalIndex(const U32 *indexes, const U32 *dims, U32 nDims) { U32 index = 0; for (int i = ((int)nDims) - 1; i >= 0; i--) { - index = index * dims[i] + indexes[i]; + U32 value = indexes[i] >= dims[i] ? 
0 : indexes[i]; + index = index * dims[i] + value; } return index; } @@ -470,13 +495,13 @@ typedef enum { } GCLMemType; struct GCLMemDesc { - U32 dims[6]; + U32 dims[DIM_LEN]; U32 nDims; DataType dt; DataFormat df; U32 stride[3]; - U32 offset[6]; + U32 offset[DIM_LEN]; GCLMemType memType; DataFormat memFormat; U32 byteSize; diff --git a/common/memory/include/tensor_transpose.h b/common/memory/include/tensor_transpose.h index 5a37ab33..63a097f4 100644 --- a/common/memory/include/tensor_transpose.h +++ b/common/memory/include/tensor_transpose.h @@ -16,10 +16,10 @@ #include "tensor_desc.h" #include "uni.h" -#include "thread_affinity.h" +#include "affinity_policy.h" template -inline static void transformToNCHWKernel( +inline static EE transformToNCHWKernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -40,24 +40,30 @@ inline static void transformToNCHWKernel( } else if (tensorIs4d(inputDesc)) { CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); } else { - UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format\n", (int)inputDesc.nDims); - return; + UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format.\n", (int)inputDesc.nDims); + return NOT_SUPPORTED; } - if (tensorIs3d(outputDesc)) { + if (tensorIs2d(outputDesc)) { + CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &on, &oc)); + oh = ow = 1; + } else if (tensorIs3d(outputDesc)) { CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); ow = 1; } else if (tensorIs4d(outputDesc)) { CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); } else { - UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor\n", (int)outputDesc.nDims); - return; + UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor.\n", (int)outputDesc.nDims); + return NOT_SUPPORTED; } CHECK_REQUIREMENT(idt == odt); + EE ret = SUCCESS; switch (idf) { + case DF_NORMAL: + case DF_MTK: case DF_NCHW: { if (in == on && ic == oc && ih == oh && iw == ow) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt); @@ -66,7 +72,7 @@ inline static void transformToNCHWKernel( for (U32 h = 0; h < oh && h < ih; h++) { U32 srcIndex = ((n * ic + c) * ih + h) * iw; U32 dstIndex = ((n * oc + c) * oh + h) * ow; - memcpy(output + dstIndex, input + srcIndex, tileSize); + UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize); } } } @@ -160,49 +166,56 @@ inline static void transformToNCHWKernel( break; } default: { - UNI_ERROR_LOG("not support transform %s format to NCHW format\n", DataFormatName()[idf]); + UNI_ERROR_LOG( + "not support transform %s format to NCHW format.\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNCHW( TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) { if (nullptr == input || nullptr == output) { - return NULL_POINTER; + CHECK_STATUS(NULL_POINTER); } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_INT8 case DT_I8: { - 
transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + ret = transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); break; } case DT_U8_Q: { - transformToNCHWKernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); + ret = transformToNCHWKernel( + inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } template -inline static void transformToNHWCKernel( +inline static EE transformToNHWCKernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -219,19 +232,27 @@ inline static void transformToNHWCKernel( CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); } else { UNI_ERROR_LOG("not support transform %d-dim tensor to NHWC format\n", (int)inputDesc.nDims); - return; + return NOT_SUPPORTED; + } + if (tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("not support transform to %d-dim NHWC tensor.\n", (int)outputDesc.nDims); + return NOT_SUPPORTED; } - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 size = tensorNumElements(outputDesc); U32 ihiw = ih * iw; + EE ret = SUCCESS; switch (idf) { case DF_NHWC: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } break; } + case DF_NORMAL: + case DF_MTK: case DF_NCHW: { CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size); for (U32 o = 0, srcIndex = 0; o < in; o++) { @@ -244,14 +265,16 @@ inline static void transformToNHWCKernel( } break; } - case DF_NCHWC8: { - CHECK_REQUIREMENT(ic % 8 == 0); - ic /= 8; + case DF_NCHWC8: + case DF_NCHWC16: { + U32 align = (idf == DF_NCHWC16) ? 
16 : 8; + CHECK_REQUIREMENT(ic % align == 0); + ic /= align; for (U32 n = 0, srcIndex = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 hw = 0; hw < ihiw; hw++) { - for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) { - U32 dstIndex = ((n * ihiw + hw) * ic + c) * 8 + c8; + for (U32 cx = 0; cx < align; cx++, srcIndex++) { + U32 dstIndex = ((n * ihiw + hw) * ic + c) * align + cx; output[dstIndex] = input[srcIndex]; } } @@ -262,8 +285,11 @@ inline static void transformToNHWCKernel( default: { UNI_ERROR_LOG( "not support transform %s format tensor to NHWC format\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNHWC( @@ -272,30 +298,32 @@ inline EE transformToNHWC( if (nullptr == input || nullptr == output) { return NULL_POINTER; } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_INT8 case DT_I8: { - transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); + ret = transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } inline EE transformNCHWC16ToNCHWC8( @@ -309,7 +337,7 @@ inline EE transformNCHWC16ToNCHWC8( U32 in, ic, ih, iw, on, oc, oh, ow; if (tensorIs2d(inputDesc)) { if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } return SUCCESS; } else if (tensorIs3d(inputDesc)) { @@ -333,7 +361,7 @@ inline EE transformNCHWC16ToNCHWC8( U32 srcIndex = n * ic * ih * iw + c * ih * iw * 8 + (h * iw + w) * 16 + c8 * 8; U32 dstIndex = n * ic * ih * iw + (c + c8) * ih * iw * 8 + (h * iw + w) * 8; - memcpy(outputPtr + dstIndex * elementSize, + UNI_MEMCPY(outputPtr + dstIndex * elementSize, inputPtr + srcIndex * elementSize, elementSize * 8); } } @@ -354,7 +382,7 @@ inline EE transformNCHWToNCHWC8( U32 in, ic, ih, iw, on, oc, oh, ow; if (tensorIs2d(inputDesc)) { if (input != output) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } return SUCCESS; } else if (tensorIs3d(inputDesc)) { @@ -379,9 +407,9 @@ inline EE transformNCHWToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -416,9 +444,9 @@ inline EE transformNHWCToNCHWC8( // support channel padding if (c_i < ic) { U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); + UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize); } else { - memset(outputPtr + dstIndex, 0, elementSize); + UNI_MEMSET(outputPtr + dstIndex, 0, elementSize); } } } @@ -435,7 +463,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( U32 outputSize 
= tensorNumElements(outputDesc); if (group <= 1 || inputSize == outputSize) { if (input != output) { - memcpy(output, input, outputSize); + UNI_MEMCPY(output, input, outputSize); } return SUCCESS; } @@ -471,10 +499,10 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( U32 srcIndex = ((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) * elementSize; - memcpy( + UNI_MEMCPY( (U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize); } else { - memset((U8 *)output + dstIndex, 0, elementSize); + UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize); } } } @@ -485,7 +513,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup( } template -inline static void transformToNCHWC16Kernel( +inline static EE transformToNCHWC16Kernel( TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output) { DataType idt, odt; @@ -508,7 +536,7 @@ inline static void transformToNCHWC16Kernel( } else { UNI_ERROR_LOG( "not support transform %d-dim tensor to NCHWC16 format\n", (int)inputDesc.nDims); - return; + return NOT_SUPPORTED; } if (tensorIs3d(outputDesc)) { CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); @@ -517,10 +545,12 @@ inline static void transformToNCHWC16Kernel( CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); } else { UNI_ERROR_LOG("not support transform to %d-dim NCHWC16 tensor\n", (int)outputDesc.nDims); - return; + return NOT_SUPPORTED; } CHECK_REQUIREMENT(idt == odt); + EE ret = SUCCESS; switch (idf) { + case DF_NORMAL: case DF_MTK: case DF_NCHW: { U32 ic16 = ic / 16; @@ -593,8 +623,11 @@ inline static void transformToNCHWC16Kernel( default: { UNI_ERROR_LOG( "not support transform %s format to NCHWC16 format\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; } } + return ret; } inline EE transformToNCHWC16( @@ -603,37 +636,40 @@ inline EE transformToNCHWC16( if (nullptr == input || nullptr == output) { return NULL_POINTER; } + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_INT8 case DT_U8_Q: { - transformToNCHWC16Kernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); + ret = transformToNCHWC16Kernel( + inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output); break; } #endif default: { - return NOT_SUPPORTED; + UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]); + break; } } - return SUCCESS; + return ret; } inline EE transformFormat( TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output) { EE ret = NOT_SUPPORTED; - if (outputDesc.df == DF_NCHW) { + if (outputDesc.df == DF_NCHW || outputDesc.df == DF_MTK || outputDesc.df == DF_NORMAL) { ret = transformToNCHW(inputDesc, input, outputDesc, output); } else if (outputDesc.df == DF_NCHWC8) { if (inputDesc.df == DF_NORMAL) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); ret = SUCCESS; - } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK) { + } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK || inputDesc.df == DF_NORMAL) { ret = transformNCHWToNCHWC8(inputDesc, input, outputDesc, output); } else if (inputDesc.df == DF_NHWC) { ret = transformNHWCToNCHWC8(inputDesc, input, outputDesc, output); @@ -648,6 +684,8 @@ inline EE transformFormat( } } else if (outputDesc.df == DF_NCHWC16) { ret = transformToNCHWC16(inputDesc, input, 
outputDesc, output); + } else if (outputDesc.df == DF_NHWC) { + ret = transformToNHWC(inputDesc, input, outputDesc, output); } else { UNI_ERROR_LOG("layout transpose can not support transform to %s format.\n", DataFormatName()[outputDesc.df]); @@ -664,34 +702,39 @@ inline EE transposeFilter( DataType idt, odt; DataFormat idf, odf; U32 in, ic, ih, iw, on, oc, oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("currently only support to transpose 4-dim filter.\n"); + return NOT_SUPPORTED; + } CHECK_REQUIREMENT(idf == odf); - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - + const U8 *src = (const U8 *)input; + U8 *dst = (U8 *)output; + EE ret = SUCCESS; switch (idf) { case DF_NHWCN8: { CHECK_REQUIREMENT(in % 8 == 0); in /= 8; U32 hwMax = ih * iw - 1; - U32 innerSize = bytesOf(idt) * ic * 8; - for (U32 o = 0; o < in; o++) { for (U32 hw = 0; hw < ih * iw; hw++) { U32 srcIndex = o * ih * iw * innerSize + hw * innerSize; U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize; - memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize); + UNI_MEMCPY(dst + dstIndex, src + srcIndex, innerSize); } } break; } default: { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG( + "currently not support to transpose %s format filter.\n", DataFormatName()[idf]); + ret = NOT_SUPPORTED; + break; + } } - return SUCCESS; + return ret; } - #endif diff --git a/common/model_spec/include/model_common.h b/common/model_spec/include/model_common.h index 264f618f..b5b255e9 100644 --- a/common/model_spec/include/model_common.h +++ b/common/model_spec/include/model_common.h @@ -16,10 +16,33 @@ #include #include "model_spec.h" +#include "memory_cpu.h" EE str_copy(I8 *dst, const I8 *src, I32 src_len, I32 dst_len = NAME_LEN); -void *mt_new_storage(size_t size); +inline void *mt_malloc(U32 size) +{ + return UNI_OPERATOR_NEW(size); +} + +template +inline void mt_free(T *&p) +{ + UNI_OPERATOR_DELETE(p); + p = nullptr; +} + +// only WeightSpec's weight and vec variables are freed by using this, +// because they may use mmap memory. 
+template +inline void mt_free(T *&p, ModelSpec *spec) +{ + if (spec == nullptr || spec->mfd == nullptr || (uintptr_t(p) < uintptr_t(spec->mfd->bytes)) || + (uintptr_t(p) >= uintptr_t(spec->mfd->bytes + spec->mfd->fileLength))) { + UNI_OPERATOR_DELETE(p); + } + p = nullptr; +} OperatorSpec mt_create_operator( const char *name, OperatorType type, U32 num_inputs, U32 num_outputs); @@ -34,4 +57,7 @@ bool isDeprecatedOp(OperatorType opType); bool isDeprecatedOpWeight(const ModelSpec *spec, int index); std::string concat_dir_file(std::string dir, std::string file); + +void modify_ms_inputs_and_outputs( + ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs); #endif diff --git a/common/model_spec/include/model_spec.h b/common/model_spec/include/model_spec.h index 3df6008f..121c79e4 100644 --- a/common/model_spec/include/model_spec.h +++ b/common/model_spec/include/model_spec.h @@ -16,7 +16,7 @@ #include "parameter_spec.h" -static const int sg_boltVersion = 20201120; +static const int sg_boltVersion = 20220126; static const int sg_magicNumber = 1141119; #pragma pack(8) @@ -87,14 +87,10 @@ typedef struct { } ModelSpec; #pragma pack() -#define outOfFileMapRange(addr, mfd) \ - ((mfd == nullptr) || (uintptr_t(addr) < uintptr_t(mfd->bytes)) || \ - (uintptr_t(addr) >= uintptr_t(mfd->bytes + mfd->fileLength))) - -EE mt_create_model(ModelSpec *md); +EE mt_create_model(ModelSpec *spec); EE serialize_model_to_file(const ModelSpec *spec, const char *fn); EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream = false); -EE mt_destroy_model(ModelSpec *md); +EE mt_destroy_model(ModelSpec *spec); #include "model_print.h" #endif diff --git a/common/model_spec/src/CMakeLists.txt b/common/model_spec/src/CMakeLists.txt index e6efbc94..d610b2d6 100644 --- a/common/model_spec/src/CMakeLists.txt +++ b/common/model_spec/src/CMakeLists.txt @@ -3,6 +3,9 @@ file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/common/model_spec/src/model_common.cpp b/common/model_spec/src/model_common.cpp index 1dee4ae8..96b24691 100644 --- a/common/model_spec/src/model_common.cpp +++ b/common/model_spec/src/model_common.cpp @@ -17,7 +17,7 @@ OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inputs, U32 num_outputs) { OperatorSpec newOperator; - memset(&(newOperator), 0, sizeof(OperatorSpec)); + UNI_MEMSET(&(newOperator), 0, sizeof(OperatorSpec)); U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); str_copy(newOperator.name, name, length); if (length < NAME_LEN) { @@ -25,14 +25,14 @@ OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inp } newOperator.type = type; newOperator.num_inputs = num_inputs; - newOperator.input_tensors_name = (I8 **)mt_new_storage(num_inputs * sizeof(I8 *)); + newOperator.input_tensors_name = (I8 **)mt_malloc(num_inputs * sizeof(I8 *)); for (U32 i = 0; i < num_inputs; i++) { - newOperator.input_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + newOperator.input_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); } newOperator.num_outputs = num_outputs; - newOperator.output_tensors_name = (I8 **)mt_new_storage(num_outputs * sizeof(I8 *)); + newOperator.output_tensors_name = (I8 **)mt_malloc(num_outputs 
* sizeof(I8 *)); for (U32 i = 0; i < num_outputs; i++) { - newOperator.output_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + newOperator.output_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); } newOperator.tensor_positions = NULL; newOperator.num_quant_feature = 0; @@ -46,7 +46,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) return NULL_POINTER; } OperatorSpec *operatorList = - (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); + (OperatorSpec *)mt_malloc(sizeof(OperatorSpec) * (ms->num_operator_specs + 1)); for (int i = 0; i < index; i++) { operatorList[i] = ms->ops[i]; } @@ -54,7 +54,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator) for (int i = index; i < ms->num_operator_specs; i++) { operatorList[i + 1] = ms->ops[i]; } - delete ms->ops; + mt_free(ms->ops); ms->ops = operatorList; ms->num_operator_specs++; return SUCCESS; @@ -64,7 +64,7 @@ WeightSpec mt_create_weight( const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale) { WeightSpec newWeight; - memset(&(newWeight), 0, sizeof(WeightSpec)); + UNI_MEMSET(&(newWeight), 0, sizeof(WeightSpec)); U32 length = UNI_MIN(strlen(name), NAME_LEN - 1); str_copy(newWeight.op_name, name, length); if (length < NAME_LEN) { @@ -72,11 +72,11 @@ WeightSpec mt_create_weight( } newWeight.mdt = dataType; newWeight.bytes_of_weight = bytesOfWeight; - newWeight.weight = (U8 *)mt_new_storage(bytesOfWeight); + newWeight.weight = (U8 *)mt_malloc(bytesOfWeight); newWeight.bytes_of_vec = bytesOfVec; - newWeight.vec = (U8 *)mt_new_storage(bytesOfVec); + newWeight.vec = (U8 *)mt_malloc(bytesOfVec); newWeight.num_quant_scale = numQuantScale; - newWeight.weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * numQuantScale); + newWeight.weight_scale = (QuantSpec *)mt_malloc(sizeof(QuantSpec) * numQuantScale); return newWeight; } @@ -100,31 +100,18 @@ bool isDeprecatedOpWeight(const ModelSpec *spec, int index) EE str_copy(I8 *dst, const I8 *src, I32 srcLen, I32 dstLen) { - //memset(dst, 0, dstLen); + //UNI_MEMSET(dst, 0, dstLen); //I32 copyLen = UNI_MIN(srcLen, dstLen); - //memcpy(dst, src, copyLen); - memset(dst, 0, dstLen); + //UNI_MEMCPY(dst, src, copyLen); + UNI_MEMSET(dst, 0, dstLen); I32 copyLen = NAME_LEN - 1; if (copyLen > srcLen) { copyLen = srcLen; } - memcpy(dst, src, copyLen * sizeof(I8)); + UNI_MEMCPY(dst, src, copyLen * sizeof(I8)); return SUCCESS; } -void *mt_new_storage(size_t size) -{ - void *ret = nullptr; - if (size > 0) { - try { - ret = operator new(size); - } catch (const std::bad_alloc &e) { - UNI_ERROR_LOG("%s alloc %d bytes failed\n", __FUNCTION__, (int)size); - } - } - return ret; -} - std::string concat_dir_file(std::string dir, std::string file) { std::string ret; @@ -143,3 +130,66 @@ std::string concat_dir_file(std::string dir, std::string file) return ret; } + +std::vector string_parser(std::string s, std::string delimiter) +{ + std::vector res; + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + res.push_back(token); + s.erase(0, pos + delimiter.length()); + } + res.push_back(s); + return res; +} + +void modify_ms_inputs_and_outputs( + ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs) +{ + std::map modifiedStrMap; + if (modifiedInputs.length() > 0) { + std::vector modified_input_names = string_parser(modifiedInputs, ","); + if ((I32)(modified_input_names.size()) != ms->num_inputs) 
{ + UNI_ERROR_LOG("input names not match, please check your params meticulously.\n"); + } + for (int i = 0; i < ms->num_inputs; i++) { + std::string tmpStr = modified_input_names[i]; + modifiedStrMap[std::string(ms->input_names[i])] = tmpStr; + str_copy(ms->input_names[i], tmpStr.c_str(), tmpStr.length()); + } + } + if (modifiedOutputs.length() > 0) { + std::vector modified_output_names = string_parser(modifiedOutputs, ","); + if ((I32)(modified_output_names.size()) != ms->num_outputs) { + UNI_ERROR_LOG("output names not match, please check your params meticulously.\n"); + } + for (int i = 0; i < ms->num_outputs; i++) { + std::string tmpStr = modified_output_names[i]; + modifiedStrMap[std::string(ms->output_names[i])] = tmpStr; + str_copy(ms->output_names[i], tmpStr.c_str(), tmpStr.length()); + } + } + + if (modifiedStrMap.size() > 0) { + for (I32 i = 0; i < ms->num_operator_specs; i++) { + for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { + std::string curStr = std::string(ms->ops[i].input_tensors_name[j]); + if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) { + std::string modifiedStr = modifiedStrMap[curStr]; + str_copy(ms->ops[i].input_tensors_name[j], modifiedStr.c_str(), + modifiedStr.length()); + } + } + for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { + std::string curStr = std::string(ms->ops[i].output_tensors_name[j]); + if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) { + std::string modifiedStr = modifiedStrMap[curStr]; + str_copy(ms->ops[i].output_tensors_name[j], modifiedStr.c_str(), + modifiedStr.length()); + } + } + } + } +} diff --git a/common/model_spec/src/model_deserialize.cpp b/common/model_spec/src/model_deserialize.cpp index 13f8bcbb..8388c5a9 100644 --- a/common/model_spec/src/model_deserialize.cpp +++ b/common/model_spec/src/model_deserialize.cpp @@ -128,16 +128,16 @@ EE operator_relationship(ModelSpec *spec) int opNum = spec->num_operator_specs; spec->num_op_tensor_entries = opNum; OperatorSpec *opsPtr2 = spec->ops; - OperatorRelationshipMapEntry *oprmePtr = (OperatorRelationshipMapEntry *)mt_new_storage( - sizeof(OperatorRelationshipMapEntry) * opNum); + OperatorRelationshipMapEntry *oprmePtr = + (OperatorRelationshipMapEntry *)mt_malloc(sizeof(OperatorRelationshipMapEntry) * opNum); spec->op_relationship_entries = oprmePtr; for (int j = 0; j < opNum; j++) { str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN); int opInOpNum = opInTensorNew[opsPtr2[j].name].size(); oprmePtr[j].num_inputs = opInOpNum; - oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *)); + oprmePtr[j].input_op_names = (I8 **)mt_malloc(opInOpNum * sizeof(I8 *)); for (int k = 0; k < opInOpNum; k++) { - oprmePtr[j].input_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + oprmePtr[j].input_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); std::string ten_name = opInTensorNew[opsPtr2[j].name][k]; std::string tensor2op = tensorOpMapping[ten_name]; str_copy(oprmePtr[j].input_op_names[k], tensor2op.c_str(), tensor2op.length()); @@ -145,9 +145,9 @@ EE operator_relationship(ModelSpec *spec) int opOutOpNum = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]].size(); oprmePtr[j].num_outputs = opOutOpNum; - oprmePtr[j].output_op_names = (I8 **)mt_new_storage(opOutOpNum * sizeof(I8 *)); + oprmePtr[j].output_op_names = (I8 **)mt_malloc(opOutOpNum * sizeof(I8 *)); for (int k = 0; k < opOutOpNum; k++) { - oprmePtr[j].output_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + oprmePtr[j].output_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); 
std::string tensor2op = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]][k]; str_copy(oprmePtr[j].output_op_names[k], tensor2op.c_str(), tensor2op.length()); } @@ -163,11 +163,11 @@ void dequantize_int8_weight(int num, F32 scale, INT8 *q, T *d) int base = -127; for (int i = 0; i < 255; i++) { F32 value = factor * base; -#ifndef __aarch64__ +#ifndef _USE_FP16 if (dt != DT_F16) { #endif table[i] = value; -#ifndef __aarch64__ +#ifndef _USE_FP16 } else { transformFromFloat(DT_F16, &value, table + i, 1); } @@ -184,7 +184,7 @@ template inline void deserialize_field(const char **buffer, U32 *position, T *element, int length = 1) { int size = length * sizeof(T); - memcpy(element, *buffer, size); + UNI_MEMCPY(element, *buffer, size); *buffer += size; *position += size; } @@ -196,18 +196,20 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos) deserialize_field(pointer, pos, &spec->version); if (spec->version != sg_boltVersion) { - UNI_ERROR_LOG("X2bolt version is [%d], but your model version is : [%d].\n Please update " - "X2bolt to version[%d].\n", - sg_boltVersion, spec->version, spec->version); - CHECK_STATUS(NOT_MATCH); + UNI_WARNING_LOG("The library's model reader version (%d) should match the model " + "file version, but your model version is %d. This may " + "cause errors.\nPlease use another library version or reconvert the model.\n", + sg_boltVersion, spec->version); + } + if (spec->version < 20201120) { + UNI_ERROR_LOG("This library can not read a model with version (%d).\n", spec->version); return NOT_MATCH; } deserialize_field(pointer, pos, &spec->magic_number); if (spec->magic_number != sg_magicNumber) { - UNI_ERROR_LOG( - "magic_number not_match: code %d bolt model %d\n", sg_magicNumber, spec->magic_number); - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("magic number does not match: library is %d, bolt model is %d\n", sg_magicNumber, + spec->magic_number); return NOT_MATCH; } @@ -215,18 +217,18 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos) deserialize_field(pointer, pos, &spec->dt); deserialize_field(pointer, pos, &spec->num_inputs); - spec->input_names = (I8 **)mt_new_storage(spec->num_inputs * sizeof(I8 *)); - spec->input_dims = (TensorDesc *)mt_new_storage(spec->num_inputs * sizeof(TensorDesc)); + spec->input_names = (I8 **)mt_malloc(spec->num_inputs * sizeof(I8 *)); + spec->input_dims = (TensorDesc *)mt_malloc(spec->num_inputs * sizeof(TensorDesc)); for (int i = 0; i < spec->num_inputs; i++) { - spec->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + spec->input_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, spec->input_names[i], NAME_LEN); } deserialize_field(pointer, pos, spec->input_dims, spec->num_inputs); deserialize_field(pointer, pos, &spec->num_outputs); - spec->output_names = (I8 **)mt_new_storage(spec->num_outputs * NAME_LEN); + spec->output_names = (I8 **)mt_malloc(spec->num_outputs * NAME_LEN); for (int i = 0; i < spec->num_outputs; i++) { - spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + spec->output_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, spec->output_names[i], NAME_LEN); } return SUCCESS; @@ -238,43 +240,57 @@ EE deserialize_operator(const char *bytes, ModelSpec *spec, U32 *pos) const char **pointer = &operator_pointer; deserialize_field(pointer, pos, &spec->num_operator_specs); - spec->ops = (OperatorSpec *)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec)); + spec->ops = (OperatorSpec 
*)mt_malloc(spec->num_operator_specs * sizeof(OperatorSpec)); OperatorSpec *ptr = spec->ops; for (int i = 0; i < spec->num_operator_specs; i++) { deserialize_field(pointer, pos, ptr[i].name, NAME_LEN); deserialize_field(pointer, pos, &ptr[i].type); deserialize_field(pointer, pos, &ptr[i].num_inputs); - ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *)); + ptr[i].input_tensors_name = (I8 **)mt_malloc(ptr[i].num_inputs * sizeof(I8 *)); for (U32 j = 0; j < ptr[i].num_inputs; j++) { - ptr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + ptr[i].input_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, ptr[i].input_tensors_name[j], NAME_LEN); } deserialize_field(pointer, pos, &ptr[i].num_outputs); - ptr[i].output_tensors_name = (I8 **)mt_new_storage(ptr[i].num_outputs * sizeof(I8 *)); + ptr[i].output_tensors_name = (I8 **)mt_malloc(ptr[i].num_outputs * sizeof(I8 *)); for (U32 j = 0; j < ptr[i].num_outputs; j++) { - ptr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8)); + ptr[i].output_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8)); deserialize_field(pointer, pos, ptr[i].output_tensors_name[j], NAME_LEN); } U32 numTensors = ptr[i].num_inputs + ptr[i].num_outputs; - ptr[i].tensor_positions = (I32 *)mt_new_storage(numTensors * sizeof(I32)); + ptr[i].tensor_positions = (I32 *)mt_malloc(numTensors * sizeof(I32)); deserialize_field(pointer, pos, ptr[i].tensor_positions, numTensors); deserialize_field(pointer, pos, &ptr[i].num_quant_feature); - ptr[i].feature_scale = - (QuantSpec *)mt_new_storage(ptr[i].num_quant_feature * sizeof(QuantSpec)); + ptr[i].feature_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_feature * sizeof(QuantSpec)); for (U32 j = 0; j < ptr[i].num_quant_feature; j++) { deserialize_field(pointer, pos, &(ptr[i].feature_scale[j].num_scale)); ptr[i].feature_scale[j].scale = - (F32 *)mt_new_storage(ptr[i].feature_scale[j].num_scale * sizeof(F32)); + (F32 *)mt_malloc(ptr[i].feature_scale[j].num_scale * sizeof(F32)); deserialize_field( pointer, pos, ptr[i].feature_scale[j].scale, ptr[i].feature_scale[j].num_scale); } - deserialize_field( - pointer, pos, (U8 *)&(ptr[i].ps), get_operator_parameter_size(ptr[i].type)); + deserialize_field(pointer, pos, (U8 *)&(ptr[i].ps), + get_operator_parameter_size(spec->version, ptr[i].type)); + if (spec->version == 20201120) { + if (ptr[i].type == OT_Conv || ptr[i].type == OT_Deconvolution) { + ptr[i].ps.conv_spec.output_pad_t = 0; + ptr[i].ps.conv_spec.output_pad_h = 0; + ptr[i].ps.conv_spec.output_pad_w = 0; + } + if (ptr[i].type == OT_LayerNorm) { + ptr[i].ps.ln_spec.axis = -1; + } + } + if (spec->version == 20201120 || spec->version == 20211021) { + if (ptr[i].type == OT_Transpose) { + ptr[i].ps.transpose_spec.df = DF_NCHW; + } + } } return SUCCESS; } @@ -285,7 +301,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) const char **pointer = &weight_pointer; deserialize_field(pointer, pos, &spec->num_weight_specs); - spec->ws = (WeightSpec *)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec)); + spec->ws = (WeightSpec *)mt_malloc(spec->num_weight_specs * sizeof(WeightSpec)); WeightSpec *ptr = spec->ws; for (int i = 0; i < spec->num_weight_specs; i++) { U32 length = 0, count = 0; @@ -296,17 +312,19 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) bool quantFP16 = false; bool quantInt8 = false; - if (DT_F16 == ptr[i].mdt && DT_F32 == spec->dt) { - ptr[i].mdt = DT_F32; - quantFP16 
= true; - } else if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) { - if (spec->dt == DT_F16_8Q) { - ptr[i].mdt = DT_F16; - } else if (spec->dt == DT_F32_8Q) { - ptr[i].mdt = DT_F32; - } else { - ptr[i].mdt = spec->dt; + if (DT_F32 == spec->dt) { + if (ptr[i].mdt == DT_F16) { + quantFP16 = true; + } + if (ptr[i].mdt == DT_I8) { + quantInt8 = true; } + ptr[i].mdt = DT_F32; + } else if (DT_F16_8Q == ptr[i].mdt) { + ptr[i].mdt = DT_F16; + quantInt8 = true; + } else if (DT_F32_8Q == ptr[i].mdt) { + ptr[i].mdt = DT_F32; quantInt8 = true; } @@ -338,12 +356,11 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) } deserialize_field(pointer, pos, &ptr[i].num_quant_scale); - ptr[i].weight_scale = - (QuantSpec *)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec)); + ptr[i].weight_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_scale * sizeof(QuantSpec)); for (U32 j = 0; j < ptr[i].num_quant_scale; j++) { deserialize_field(pointer, pos, &(ptr[i].weight_scale[j].num_scale)); ptr[i].weight_scale[j].scale = - (F32 *)mt_new_storage(ptr[i].weight_scale[j].num_scale * sizeof(F32)); + (F32 *)mt_malloc(ptr[i].weight_scale[j].num_scale * sizeof(F32)); deserialize_field( pointer, pos, ptr[i].weight_scale[j].scale, ptr[i].weight_scale[j].num_scale); } @@ -351,21 +368,21 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) CHECK_REQUIREMENT(length == count); if (quantFP16) { - ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); - ptr[i].vec = (U8 *)mt_new_storage(ptr[i].bytes_of_vec); + ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight); + ptr[i].vec = (U8 *)mt_malloc(ptr[i].bytes_of_vec); transformToFloat(DT_F16, serialWeight, (F32 *)ptr[i].weight, ptr[i].bytes_of_weight / 4); transformToFloat(DT_F16, serialBias, (F32 *)ptr[i].vec, ptr[i].bytes_of_vec / 4); } else { if (quantInt8) { CHECK_REQUIREMENT( 1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale); - ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight); + ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight); F32 scale = ptr[i].weight_scale[0].scale[0]; if (DT_F32 == ptr[i].mdt) { dequantize_int8_weight(ptr[i].bytes_of_weight / 4, scale, (INT8 *)serialWeight, (F32 *)ptr[i].weight); } else if (DT_F16 == ptr[i].mdt) { -#ifdef __aarch64__ +#ifdef _USE_FP16 dequantize_int8_weight(ptr[i].bytes_of_weight / 2, scale, (INT8 *)serialWeight, (F16 *)ptr[i].weight); #else @@ -375,7 +392,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) } else { UNI_ERROR_LOG( "Can not support convert INT8 data to %s.\n", DataTypeName()[ptr[i].mdt]); - exit(1); + return NOT_SUPPORTED; } } else { ptr[i].weight = serialWeight; @@ -389,28 +406,36 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos) EE deserialize_model(const char *bytes, ModelSpec *spec) { U32 pos = 0; - CHECK_STATUS(deserialize_header(bytes, spec, &pos)); - CHECK_STATUS(deserialize_operator(bytes, spec, &pos)); - CHECK_STATUS(deserialize_weight(bytes, spec, &pos)); - CHECK_STATUS(operator_relationship(spec)); + EE ret = deserialize_header(bytes, spec, &pos); + if (ret == SUCCESS) { + ret = deserialize_operator(bytes, spec, &pos); + } + if (ret == SUCCESS) { + ret = deserialize_weight(bytes, spec, &pos); + } + if (ret == SUCCESS) { + ret = operator_relationship(spec); + } if (spec->mfd->useFileStream) { spec->mfd->fileLength = pos; } - return SUCCESS; + return ret; } EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream) { UNI_DEBUG_LOG("Read bolt model from %s...\n", 
(useFileStream ? "file stream" : fn)); + EE ret = NOT_SUPPORTED; UNI_PROFILE( { char *bytes = nullptr; int fd = -1; size_t fileLength; - spec->mfd = (ModelFileDescriptor *)mt_new_storage(sizeof(ModelFileDescriptor)); + spec->mfd = (ModelFileDescriptor *)mt_malloc(sizeof(ModelFileDescriptor)); spec->mfd->useFileStream = useFileStream; if (useFileStream) { bytes = (char *)fn; + ret = SUCCESS; } else { #ifdef _WIN32 FILE *file = fopen(fn, "rb"); @@ -423,7 +448,7 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre fileLength = ftell(file); rewind(file); - bytes = (char *)malloc(sizeof(char) * fileLength); + bytes = (char *)UNI_MALLOC(sizeof(char) * fileLength); if (bytes == NULL) { UNI_ERROR_LOG("Memory allocated for model failed.\n"); } @@ -459,9 +484,9 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre } spec->mfd->bytes = bytes; - CHECK_STATUS(deserialize_model(bytes, spec)); + ret = deserialize_model(bytes, spec); }, std::string("deserialize_model_from_file"), std::string("prepare")); UNI_DEBUG_LOG("Read bolt model end.\n"); - return SUCCESS; + return ret; } diff --git a/common/model_spec/src/model_print.cpp b/common/model_spec/src/model_print.cpp index f526b79e..5a3f5654 100644 --- a/common/model_spec/src/model_print.cpp +++ b/common/model_spec/src/model_print.cpp @@ -16,6 +16,7 @@ void print_header(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT printf("[Model] %s\n [DataType] %s\n [Inputs] %d\n", ms.model_name, DataTypeName()[ms.dt], ms.num_inputs); if (ms.num_inputs > 0) { @@ -32,10 +33,12 @@ void print_header(const ModelSpec ms) for (int i = 0; i < ms.num_outputs; i++) { printf(" %2d %s\n", i, ms.output_names[i]); } +#endif } void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp) { +#ifdef _USE_MODEL_PRINT int number = ms.num_operator_specs; printf(" [Operators] %d\n", number); if (number > 0) { @@ -72,10 +75,12 @@ void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecate } printf("\n"); } +#endif } void print_weights(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT std::map vec_data_type; for (int i = 0; i < ms.num_operator_specs; i++) { switch (ms.ops[i].type) { @@ -129,10 +134,12 @@ void print_weights(const ModelSpec ms) } printf("\n"); } +#endif } void print_relationship(const ModelSpec ms) { +#ifdef _USE_MODEL_PRINT int number = ms.num_op_tensor_entries; printf(" [Relationships] %d\n", number); if (number > 0) { @@ -149,6 +156,7 @@ void print_relationship(const ModelSpec ms) } printf("\n"); } +#endif } void print_ms(const ModelSpec ms) diff --git a/common/model_spec/src/model_serialize.cpp b/common/model_spec/src/model_serialize.cpp index 617bc183..136a34ce 100644 --- a/common/model_spec/src/model_serialize.cpp +++ b/common/model_spec/src/model_serialize.cpp @@ -18,14 +18,14 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) U32 bufSize = sizeof(I32) * 2 + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs + sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs; - I8 *data = (I8 *)mt_new_storage(bufSize); + I8 *data = (I8 *)mt_malloc(bufSize); I32 *pointer4version = (I32 *)data; - memcpy(pointer4version, &spec->version, sizeof(I32)); + UNI_MEMCPY(pointer4version, &spec->version, sizeof(I32)); pointer4version += 1; I32 *pointer4magicNumber = (I32 *)pointer4version; - memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32)); + UNI_MEMCPY(pointer4magicNumber, 
&spec->magic_number, sizeof(I32)); pointer4magicNumber += 1; I8 *pointer4modelName = (I8 *)pointer4magicNumber; @@ -47,7 +47,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) } TensorDesc *pointer4TensorDesc = (TensorDesc *)pointer4InputNames; - memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); + UNI_MEMCPY(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs); pointer4TensorDesc += spec->num_inputs; I32 *pointer4numOutputs = (I32 *)pointer4TensorDesc; @@ -63,7 +63,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4outputNames - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -72,7 +72,8 @@ U32 operator_memory_size(OperatorSpec *ops) // sizeof(U32) * 4 : type + num_inputs + num_output + num_quant_feature U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 + ops->num_inputs * NAME_LEN * sizeof(I8) + ops->num_outputs * NAME_LEN * sizeof(I8) + - (ops->num_inputs + ops->num_outputs) * sizeof(I32) + get_operator_parameter_size(ops->type); + (ops->num_inputs + ops->num_outputs) * sizeof(I32) + + get_operator_parameter_size(sg_boltVersion, ops->type); for (U32 i = 0; i < ops->num_quant_feature; i++) { allocatedBufferSize += sizeof(int); // num_scale @@ -95,7 +96,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) opsTmp++; } - char *data = (char *)mt_new_storage(bufSize); + char *data = (char *)mt_malloc(bufSize); I32 *pointer4numOperatorSpecs = (I32 *)data; *pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // attention @@ -139,7 +140,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) I32 *pointer4tensorPos = (I32 *)pointer4opsOutputTensorsName; U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs; if (nullptr != opsPointer[i].tensor_positions) { - memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); + UNI_MEMCPY(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32)); } else { for (U32 j = 0; j < numTensors; j++) { pointer4tensorPos[j] = -1; @@ -156,13 +157,13 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) *pointer4quant = opsPointer[i].feature_scale[j].num_scale; int num = *pointer4quant; pointer4quant++; - memcpy(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); + UNI_MEMCPY(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32)); pointer4quant += num; } char *pointer4parameterSpecs = (char *)pointer4quant; - int operatorParameterSize = get_operator_parameter_size(opsPointer[i].type); - memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); + int operatorParameterSize = get_operator_parameter_size(sg_boltVersion, opsPointer[i].type); + UNI_MEMCPY(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize); pointer4parameterSpecs += operatorParameterSize; pointer4opsName = (I8 *)pointer4parameterSpecs; } @@ -170,7 +171,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4opsName - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -194,7 +195,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) weightCount++; } - char *data = (char *)mt_new_storage(bufSize); + char *data = (char *)mt_malloc(bufSize); I32 *pointer4numWeightSpecs = (I32 *)data; 
*pointer4numWeightSpecs = weightCount; @@ -225,7 +226,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) pointer4wsBytesOfWeight++; U8 *pointer4wsWeight = (U8 *)pointer4wsBytesOfWeight; - memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); + UNI_MEMCPY(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight); pointer4wsWeight += wsPointer[i].bytes_of_weight; U32 *pointer4wsBytesOfVec = (U32 *)pointer4wsWeight; @@ -233,7 +234,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) pointer4wsBytesOfVec++; U8 *pointer4wsVec = (U8 *)pointer4wsBytesOfVec; - memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); + UNI_MEMCPY(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec); pointer4wsVec += wsPointer[i].bytes_of_vec; U32 *pointer4numquant = (U32 *)pointer4wsVec; @@ -245,7 +246,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) *pointer4quant = wsPointer[i].weight_scale[j].num_scale; int num = *pointer4quant; pointer4quant++; - memcpy(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); + UNI_MEMCPY(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32)); pointer4quant += num; } @@ -255,7 +256,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp) tmp->clear(); CHECK_REQUIREMENT((U32)(pointer4wsOpName - data) == bufSize); tmp->assign(data, data + bufSize); - delete data; + mt_free(data); return SUCCESS; } @@ -299,8 +300,10 @@ EE serialize_model_to_file(const ModelSpec *spec, const char *fn) { UNI_DEBUG_LOG("Write bolt model to %s...\n", fn); std::string bytes = ""; - CHECK_STATUS(serialize_model(spec, &bytes)); - CHECK_STATUS(write_to_file(&bytes, fn)); + EE ret = serialize_model(spec, &bytes); + if (ret == SUCCESS) { + ret = write_to_file(&bytes, fn); + } UNI_DEBUG_LOG("Write bolt model end.\n"); return SUCCESS; } diff --git a/common/model_spec/src/model_spec.cpp b/common/model_spec/src/model_spec.cpp index 6de15409..0876089c 100644 --- a/common/model_spec/src/model_spec.cpp +++ b/common/model_spec/src/model_spec.cpp @@ -15,7 +15,7 @@ #include #endif -#include "model_spec.h" +#include "model_common.h" EE mt_create_model(ModelSpec *ms) { @@ -49,29 +49,22 @@ EE mt_destroy_model(ModelSpec *ms) if (nullptr != ms->input_names) { for (int i = 0; i < ms->num_inputs; i++) { - if (nullptr != ms->input_names[i]) { - delete ms->input_names[i]; - } - ms->input_names[i] = nullptr; + mt_free(ms->input_names[i]); } - delete ms->input_names; - ms->input_names = nullptr; + ms->num_inputs = 0; + mt_free(ms->input_names); } if (nullptr != ms->input_dims) { - delete ms->input_dims; - ms->input_dims = nullptr; + mt_free(ms->input_dims); } if (nullptr != ms->output_names) { for (int i = 0; i < ms->num_outputs; i++) { - if (nullptr != ms->output_names[i]) { - delete ms->output_names[i]; - } - ms->output_names[i] = nullptr; + mt_free(ms->output_names[i]); } - delete ms->output_names; - ms->output_names = nullptr; + ms->num_outputs = 0; + mt_free(ms->output_names); } if (nullptr != ms->ops) { @@ -79,92 +72,79 @@ EE mt_destroy_model(ModelSpec *ms) for (int i = 0; i < op_num; i++) { if (nullptr != ms->ops[i].input_tensors_name) { for (U32 j = 0; j < ms->ops[i].num_inputs; j++) { - if (nullptr != ms->ops[i].input_tensors_name[j]) { - delete ms->ops[i].input_tensors_name[j]; - } - ms->ops[i].input_tensors_name[j] = nullptr; + mt_free(ms->ops[i].input_tensors_name[j]); } - delete ms->ops[i].input_tensors_name; - ms->ops[i].input_tensors_name = nullptr; + ms->ops[i].num_inputs = 
0; + mt_free(ms->ops[i].input_tensors_name); } if (nullptr != ms->ops[i].output_tensors_name) { for (U32 j = 0; j < ms->ops[i].num_outputs; j++) { - if (nullptr != ms->ops[i].output_tensors_name[j]) { - delete ms->ops[i].output_tensors_name[j]; - } - ms->ops[i].output_tensors_name[j] = nullptr; + mt_free(ms->ops[i].output_tensors_name[j]); } - delete ms->ops[i].output_tensors_name; - ms->ops[i].output_tensors_name = nullptr; - } - - if (nullptr != ms->ops[i].tensor_positions) { - delete ms->ops[i].tensor_positions; + ms->ops[i].num_outputs = 0; + mt_free(ms->ops[i].output_tensors_name); } + mt_free(ms->ops[i].tensor_positions); if (0 != ms->ops[i].num_quant_feature && nullptr != ms->ops[i].feature_scale) { for (U32 j = 0; j < ms->ops[i].num_quant_feature; j++) { if (0 != ms->ops[i].feature_scale[j].num_scale) { - if (nullptr != ms->ops[i].feature_scale[j].scale) { - delete ms->ops[i].feature_scale[j].scale; - } + ms->ops[i].feature_scale[j].num_scale = 0; + mt_free(ms->ops[i].feature_scale[j].scale); } } - delete ms->ops[i].feature_scale; + ms->ops[i].num_quant_feature = 0; + mt_free(ms->ops[i].feature_scale); } } - delete ms->ops; - ms->ops = nullptr; + ms->num_operator_specs = 0; + mt_free(ms->ops); } if (nullptr != ms->ws) { - int weightOpNum = ms->num_weight_specs; - for (int i = 0; i < weightOpNum; i++) { - if (nullptr != ms->ws[i].weight && outOfFileMapRange(ms->ws[i].weight, ms->mfd)) { - delete ms->ws[i].weight; - } - ms->ws[i].weight = nullptr; - if (nullptr != ms->ws[i].vec && outOfFileMapRange(ms->ws[i].vec, ms->mfd)) { - delete ms->ws[i].vec; + for (int i = 0; i < ms->num_weight_specs; i++) { + ms->ws[i].bytes_of_weight = 0; + mt_free(ms->ws[i].weight, ms); + ms->ws[i].bytes_of_vec = 0; + mt_free(ms->ws[i].vec, ms); + for (U32 j = 0; j < ms->ws[i].num_quant_scale; j++) { + if (0 != ms->ws[i].weight_scale[j].num_scale) { + ms->ws[i].weight_scale[j].num_scale = 0; + mt_free(ms->ws[i].weight_scale[j].scale); + } } - ms->ws[i].vec = nullptr; + ms->ws[i].num_quant_scale = 0; + mt_free(ms->ws[i].weight_scale); } - delete ms->ws; - ms->ws = nullptr; + ms->num_weight_specs = 0; + mt_free(ms->ws); } if (nullptr != ms->op_relationship_entries) { - int numOpRelationPair = ms->num_op_tensor_entries; - for (int i = 0; i < numOpRelationPair; i++) { + for (int i = 0; i < ms->num_op_tensor_entries; i++) { if (nullptr != ms->op_relationship_entries[i].input_op_names) { for (U32 j = 0; j < ms->op_relationship_entries[i].num_inputs; j++) { - if (nullptr != ms->op_relationship_entries[i].input_op_names[j]) { - delete ms->op_relationship_entries[i].input_op_names[j]; - } - ms->op_relationship_entries[i].input_op_names[j] = nullptr; + mt_free(ms->op_relationship_entries[i].input_op_names[j]); } - delete ms->op_relationship_entries[i].input_op_names; - ms->op_relationship_entries[i].input_op_names = nullptr; + ms->op_relationship_entries[i].num_inputs = 0; + mt_free(ms->op_relationship_entries[i].input_op_names); } if (nullptr != ms->op_relationship_entries[i].output_op_names) { for (U32 j = 0; j < ms->op_relationship_entries[i].num_outputs; j++) { - if (nullptr != ms->op_relationship_entries[i].output_op_names[j]) { - delete ms->op_relationship_entries[i].output_op_names[j]; - } - ms->op_relationship_entries[i].output_op_names[j] = nullptr; + mt_free(ms->op_relationship_entries[i].output_op_names[j]); } - delete ms->op_relationship_entries[i].output_op_names; - ms->op_relationship_entries[i].output_op_names = nullptr; + ms->op_relationship_entries[i].num_outputs = 0; + 
mt_free(ms->op_relationship_entries[i].output_op_names); } } - delete ms->op_relationship_entries; - ms->op_relationship_entries = nullptr; + ms->num_op_tensor_entries = 0; + mt_free(ms->op_relationship_entries); } if (ms->mfd != nullptr && !ms->mfd->useFileStream && ms->mfd->bytes != nullptr) { #ifdef _WIN32 // use fread to read model file - free(ms->mfd->bytes); + UNI_FREE(ms->mfd->bytes); #else // use mmap to read model file munmap(ms->mfd->bytes, ms->mfd->fileLength); @@ -173,9 +153,6 @@ EE mt_destroy_model(ModelSpec *ms) } #endif } - - delete ms->mfd; - ms->mfd = nullptr; - + mt_free(ms->mfd); return SUCCESS; } diff --git a/common/uni/include/affinity_policy.h b/common/uni/include/affinity_policy.h new file mode 100644 index 00000000..b0f9b85f --- /dev/null +++ b/common/uni/include/affinity_policy.h @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_AFFINITY_POLICY +#define _H_AFFINITY_POLICY + +#include "sys.h" +#ifdef _USE_OPENMP +#include +#define OMP_MAX_NUM_THREADS \ + (getenv("OMP_NUM_THREADS") == NULL ? 
omp_get_num_procs() : atoi(getenv("OMP_NUM_THREADS"))) +#else +#define OMP_MAX_NUM_THREADS 1 +#endif +extern int OMP_NUM_THREADS; +const int CPU_MAX_NUMBER = 128; + +typedef enum { + AFFINITY_CPU = 0, + AFFINITY_CPU_LOW_POWER = 1, + AFFINITY_CPU_HIGH_PERFORMANCE = 2, + AFFINITY_GPU = 3 +} AffinityPolicy; + +typedef struct CpuStat { + unsigned long idle; + unsigned long total; +} CpuStat; + +typedef struct DeviceInfo { + int cpuNum; + Arch archs[CPU_MAX_NUMBER]; + long freqs[CPU_MAX_NUMBER]; + float occupys[CPU_MAX_NUMBER]; + int cpuids[CPU_MAX_NUMBER]; + CpuStat cpuStats[CPU_MAX_NUMBER]; + + float maxOccupy; + AffinityPolicy affinityPolicy; + Arch schedule; +} DeviceInfo; + +inline const char *const *AffinityPolicyNames() +{ + static const char *const names[] = { + "CPU_AFFINITY", "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"}; + return names; +} + +inline const AffinityPolicy *AffinityPolicies() +{ + static const AffinityPolicy policies[] = { + AFFINITY_CPU, AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU}; + return policies; +} + +inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name) +{ + for (int i = 0; i < 4; i++) { + const char *target = AffinityPolicyNames()[i]; + if (strcmp(target, name) == 0) { + return AffinityPolicies()[i]; + } + } + return AFFINITY_CPU_HIGH_PERFORMANCE; +} + +inline void set_cpu_num_threads(int threadNum) +{ +#ifndef _USE_OPENMP + if (threadNum > 1) { + UNI_WARNING_LOG("this library not support multi-threads parallel, please rebuild with " + "--openmp option.\n"); + } +#endif + if (threadNum < 0) { + threadNum = 1; + } + if (threadNum > OMP_MAX_NUM_THREADS) { + threadNum = OMP_MAX_NUM_THREADS; + } + OMP_NUM_THREADS = threadNum; +} +#endif diff --git a/common/uni/include/algorithm_map.h b/common/uni/include/algorithm_map.h index 22c315e9..5adecb42 100644 --- a/common/uni/include/algorithm_map.h +++ b/common/uni/include/algorithm_map.h @@ -58,9 +58,7 @@ class AlgorithmMap { if (i == 96) { continue; } - char j[8]; - sprintf(j, "%c", i); - charSet.insert(j); + charSet.insert(std::string(1, i)); } std::string name = modelName; diff --git a/common/uni/include/arm_neon_expand.h b/common/uni/include/arm_neon_expand.h index 83580162..0ff739c1 100644 --- a/common/uni/include/arm_neon_expand.h +++ b/common/uni/include/arm_neon_expand.h @@ -344,7 +344,7 @@ inline void vst1q_lane_f16_builtin(__fp16 *address, float16x8_t vec, const int l #endif #ifdef _USE_INT8 -#ifdef __aarch64__ +#ifdef _USE_FP16 inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId) { int32x4_t ret; diff --git a/common/uni/include/array_transpose.h b/common/uni/include/array_transpose.h index 52380110..579ef341 100644 --- a/common/uni/include/array_transpose.h +++ b/common/uni/include/array_transpose.h @@ -14,7 +14,8 @@ #ifndef _H_ARRAY_TRANSPOSE #define _H_ARRAY_TRANSPOSE -#include "string.h" +#include "secure_c_wrapper.h" +#include "affinity_policy.h" template static inline void inner_transpose_template(unsigned int tileSize, @@ -26,25 +27,33 @@ static inline void inner_transpose_template(unsigned int tileSize, int inputDimsNum, int outputDimsNum, unsigned int outputSize, - int sizeInnerIndex, - unsigned int *inputLocalIndex) + int sizeInnerIndex) { - for (unsigned int i = 0; i < outputSize; i++) { - unsigned int outputIndex = i; - for (int j = sizeInnerIndex; j < outputDimsNum; j++) { - unsigned int value = outputIndex % outputDims[j]; - outputIndex /= outputDims[j]; - inputLocalIndex[inputDimsNum - 1 
- transposeDims[outputDimsNum - 1 - j]] = value; - } - unsigned int inputIndex = 0; - for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; - } - inputIndex += inputLocalIndex[sizeInnerIndex]; - if (branch == 0) { - *(output + i) = *(input + inputIndex); - } else { - memcpy(output + i * tileSize, input + inputIndex * tileSize, tileSize); +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + std::vector inputLocalIndex(inputDimsNum); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (unsigned int i = 0; i < outputSize; i++) { + unsigned int outputIndex = i; + for (int j = sizeInnerIndex; j < outputDimsNum; j++) { + unsigned int value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[inputDimsNum - 1 - transposeDims[outputDimsNum - 1 - j]] = value; + } + unsigned int inputIndex = 0; + for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[sizeInnerIndex]; + if (branch == 0) { + *(output + i) = *(input + inputIndex); + } else { + UNI_MEMCPY(output + i * tileSize, input + inputIndex * tileSize, tileSize); + } } } } @@ -58,15 +67,6 @@ inline void array_transpose(unsigned int elementSize, int inputDimsNum, int outputDimsNum) { - unsigned int inputSize = 1, outputSize = 1; - for (int i = 0; i < inputDimsNum; i++) { - inputSize *= inputDims[i]; - } - for (int i = 0; i < outputDimsNum; i++) { - outputSize *= outputDims[i]; - } - CHECK_REQUIREMENT(inputSize == outputSize); - unsigned int sizeInner = 1; int sizeInnerIndex = 0; for (int i = outputDimsNum - 1; i >= 0; i--) { @@ -77,23 +77,55 @@ inline void array_transpose(unsigned int elementSize, break; } } + int tileSize = elementSize * sizeInner; + int in = inputDims[inputDimsNum - 1], ihiw = 0, ic = 0; + if (outputDimsNum - sizeInnerIndex == 3 && transposeDims[0] == 0 && transposeDims[1] == 2 && + transposeDims[2] == 1) { + ic = inputDims[inputDimsNum - 2]; + ihiw = inputDims[inputDimsNum - 3]; + } + if (outputDimsNum - sizeInnerIndex == 4 && transposeDims[0] == 0 && transposeDims[1] == 2 && + transposeDims[2] == 3 && transposeDims[3] == 1) { + ic = inputDims[inputDimsNum - 2]; + ihiw = inputDims[inputDimsNum - 3] * inputDims[inputDimsNum - 4]; + } + if (ic > 0 && ihiw > 0 && input != output) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int o = 0; o < in * ihiw; o++) { + int n = o / ihiw; + int hw = o % ihiw; + U8 *dst = (U8 *)output + o * ic * tileSize; + for (int c = 0; c < ic; c++, dst += tileSize) { + const U8 *src = (const U8 *)input + ((n * ic + c) * ihiw + hw) * tileSize; + UNI_MEMCPY(dst, src, tileSize); + } + } + return; + } + + unsigned int inputSize = 1, outputSize = 1; + for (int i = 0; i < inputDimsNum; i++) { + inputSize *= inputDims[i]; + } + for (int i = 0; i < outputDimsNum; i++) { + outputSize *= outputDims[i]; + } + CHECK_REQUIREMENT(inputSize == outputSize); outputSize = outputSize / sizeInner; - std::vector inputLocalIndex(inputDimsNum, 0); const char *inputPtr = (const char *)input; char *outputPtr = (char *)output; if (sizeInner == 1 && elementSize == 4) { inner_transpose_template<0, int>(elementSize, inputDims, (const int *)input, outputDims, - (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex, - inputLocalIndex.data()); + (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, 
sizeInnerIndex); } else if (sizeInner == 1 && elementSize == 2) { inner_transpose_template<0, short>(elementSize, inputDims, (const short *)input, outputDims, - (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex, - inputLocalIndex.data()); + (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex); } else { - inner_transpose_template<1, char>(sizeInner * elementSize, inputDims, (const char *)input, - outputDims, (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, - sizeInnerIndex, inputLocalIndex.data()); + inner_transpose_template<1, char>(tileSize, inputDims, (const char *)input, outputDims, + (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex); } } @@ -113,22 +145,31 @@ inline void array_transpose_naive(unsigned int elementSize, inputSize *= inputDims[i]; outputSize *= outputDims[i]; } - std::vector inputLocalIndex(dimsNum); const char *inputPtr = (const char *)input; char *outputPtr = (char *)output; - for (unsigned int i = 0; i < outputSize; i++) { - unsigned int outputIndex = i; - for (int j = 0; j < dimsNum; j++) { - unsigned int value = outputIndex % outputDims[j]; - outputIndex /= outputDims[j]; - inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; - } - unsigned int inputIndex = 0; - for (int j = dimsNum - 1; j > 0; j--) { - inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + std::vector inputLocalIndex(dimsNum); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (unsigned int i = 0; i < outputSize; i++) { + unsigned int outputIndex = i; + for (int j = 0; j < dimsNum; j++) { + unsigned int value = outputIndex % outputDims[j]; + outputIndex /= outputDims[j]; + inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value; + } + unsigned int inputIndex = 0; + for (int j = dimsNum - 1; j > 0; j--) { + inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1]; + } + inputIndex += inputLocalIndex[0]; + UNI_MEMCPY( + outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize); } - inputIndex += inputLocalIndex[0]; - memcpy(outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize); } } #endif diff --git a/common/uni/include/data_type.h b/common/uni/include/data_type.h index 58dbb121..9c152678 100644 --- a/common/uni/include/data_type.h +++ b/common/uni/include/data_type.h @@ -15,9 +15,9 @@ #define _H_DATA_TYPE #include -#include #include -#ifdef __aarch64__ +#include +#ifdef _USE_FP16 #include typedef __fp16 F16; #endif @@ -25,8 +25,9 @@ typedef __fp16 F16; #include #include #define FTZ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); -typedef float F16; #endif +#define _USE_ULTRA_OPTIMIZATION +#include "secure_c_wrapper.h" typedef int8_t INT8; typedef uint8_t UINT8; @@ -56,26 +57,41 @@ typedef enum { DT_BIN11 = 8, DT_F32_8Q = 9, DT_U8_Q = 10, - DT_NUM = 11 + DT_I64 = 11, + DT_U64 = 12, + DT_F64 = 13, + DT_NUM = 14 } DataType; inline const char *const *DataTypeName() { static const char *const names[] = {"DT_U8", "DT_I8", "DT_U32", "DT_I32", "DT_F16", "DT_F16_8Q", - "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_NUM"}; + "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_I64", "DT_U64", "DT_F64", + "DT_NUM"}; return names; } inline U32 bytesOf(DataType dt) { // Please divide number of elements by 8 first in the case of binary data types - U32 bytes[] = {1, 1, 4, 4, 2, 2, 
4, 1, 1, 4, 1}; - return dt < DT_NUM ? bytes[dt] : 0; + U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 4, 1, 8, 8, 8}; + U32 ret; + if (dt < DT_NUM) { + ret = bytes[dt]; + } else { + ret = 0; + printf("[ERROR] try to get unknown type:%s bytes.\n", DataTypeName()[dt]); + exit(1); + } + return ret; } #ifdef _USE_FP16 inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int num) { + if (num <= 0) { + return; + } if (num % 8 != 0) { printf("[ERROR] can not support to transformFromHalf for array(length(%d) mod 8 != 0).\n", num); @@ -110,6 +126,9 @@ inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int num) { + if (num <= 0) { + return; + } if (num % 8 != 0) { printf( "[ERROR] can not support to transformToHalf for array(length(%d) mod 8 != 0).\n", num); @@ -148,12 +167,81 @@ inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int nu } #endif +inline void transformToInt(DataType dataType, const void *src, int *dst, int num) +{ + if (num <= 0) { + return; + } + switch (dataType) { + case DT_I64: { + I64 value; + const U8 *ptr = (const U8 *)src; + for (int i = 0; i < num; i++) { + UNI_MEMCPY(&value, ptr, sizeof(I64)); + ptr += sizeof(I64); + value = value > INT_MAX ? INT_MAX : value; + dst[i] = value < INT_MIN ? INT_MIN : value; + } + break; + } + case DT_U32: + case DT_I32: { + UNI_MEMCPY(dst, src, sizeof(int) * num); + break; + } + default: { + printf("[ERROR] can not transform %s to int.\n", DataTypeName()[dataType]); + exit(1); + } + } +} + +inline unsigned short float32ToFloat16(float value) +{ + const U32 *word = (const U32 *)(&value); + unsigned short sign = (word[0] & 0x80000000) >> 31; + unsigned short exponent = (word[0] & 0x7F800000) >> 23; + unsigned int significand = word[0] & 0x7FFFFF; + + unsigned short u; + if (exponent == 0) { + u = (sign << 15) | (0x00 << 10) | 0x00; + } else if (exponent == 0xFF) { + u = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } else { + short newexp = exponent + (-127 + 15); + if (newexp >= 31) { + u = (sign << 15) | (0x1F << 10) | 0x00; + } else if (newexp <= 0) { + if (newexp >= -10) { + unsigned short sig = (significand | 0x800000) >> (14 - newexp); + u = (sign << 15) | (0x00 << 10) | sig; + } else { + u = (sign << 15) | (0x00 << 10) | 0x00; + } + } else { + u = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + return u; +} + inline void transformFromFloat( DataType dataType, const float *src, void *dst, int num, float scale = 1) { + if (num <= 0) { + return; + } switch (dataType) { case DT_F32: { - memcpy(dst, src, sizeof(float) * num); + UNI_MEMCPY(dst, src, sizeof(float) * num); + break; + } + case DT_I64: { + I64 *ptr = (I64 *)dst; + for (int i = 0; i < num; i++) { + ptr[i] = src[i]; + } break; } case DT_U32: { @@ -172,41 +260,16 @@ inline void transformFromFloat( } case DT_F16_8Q: case DT_F16: { -#ifdef __aarch64__ +#ifdef _USE_FP16 F16 *ptr = (F16 *)dst; #else - const U32 *word = (const U32 *)src; unsigned short *q = (unsigned short *)dst; #endif for (int i = 0; i < num; i++) { -#ifdef __aarch64__ +#ifdef _USE_FP16 ptr[i] = src[i]; #else - unsigned short sign = (word[i] & 0x80000000) >> 31; - unsigned short exponent = (word[i] & 0x7F800000) >> 23; - unsigned int significand = word[i] & 0x7FFFFF; - - unsigned short u; - if (exponent == 0) { - u = (sign << 15) | (0x00 << 10) | 0x00; - } else if (exponent == 0xFF) { - u = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); - } else { - short newexp = exponent + (-127 + 15); - if (newexp >= 31) { - u = (sign << 15) | (0x1F << 10) | 0x00; - } else if (newexp <= 0) { - if (newexp >= -10) { - unsigned short sig = (significand | 0x800000) >> (14 - newexp); - u = (sign << 15) | (0x00 << 10) | sig; - } else { - u = (sign << 15) | (0x00 << 10) | 0x00; - } - } else { - u = (sign << 15) | (newexp << 10) | (significand >> 13); - } - } - q[i] = u; + q[i] = float32ToFloat16(src[i]); #endif } break; @@ -235,10 +298,20 @@ inline void transformFromFloat( inline void transformToFloat( DataType dataType, const void *src, float *dst, int num, float scale = 1) { + if (num <= 0) { + return; + } switch (dataType) { case DT_F32_8Q: case DT_F32: { - memcpy(dst, src, sizeof(float) * num); + UNI_MEMCPY(dst, src, sizeof(float) * num); + break; + } + case DT_I64: { + const I64 *ptr = (const I64 *)src; + for (int i = 0; i < num; i++) { + dst[i] = ptr[i]; + } break; } case DT_U32: { @@ -257,14 +330,14 @@ inline void transformToFloat( } case DT_F16_8Q: case DT_F16: { -#ifdef __aarch64__ +#ifdef _USE_FP16 const F16 *ptr = (const F16 *)src; #else const unsigned short *q = (const unsigned short *)src; U32 *word = (U32 *)dst; #endif for (int i = 0; i < num; i++) { -#ifdef __aarch64__ +#ifdef _USE_FP16 dst[i] = ptr[i]; #else unsigned short value = q[i]; @@ -350,13 +423,19 @@ inline void transformToFloat( inline void UNI_INIT(U32 num, DataType dt, F32 val, void *dst) { + if (num <= 0) { + return; + } + if (val == 0) { + UNI_MEMSET(dst, 0, bytesOf(dt) * num); + return; + } switch (dt) { case DT_F16: { - unsigned int short mem; - transformFromFloat(DT_F16, &val, &mem, 1); - U8 *arr = (U8 *)dst; + unsigned short mem = float32ToFloat16(val); + unsigned short *arr = (unsigned short *)dst; for (U32 i = 0; i < num; i++) { - memcpy(arr + i * bytesOf(DT_F16), &mem, bytesOf(DT_F16)); + arr[i] = mem; } break; } diff --git a/common/uni/include/error.h b/common/uni/include/error.h index 00af4c70..e35e2227 100644 --- a/common/uni/include/error.h +++ b/common/uni/include/error.h @@ -19,7 +19,12 @@ #include #ifdef _WIN32 +#ifdef _USE_JNI #define UNI_THREADID int tid = 0; +#else +#include +#define UNI_THREADID int tid = GetThreadId(GetCurrentThread()); +#endif #elif defined(__GLIBC__) || defined(__linux__) #include #define UNI_THREADID pid_t tid = syscall(SYS_gettid); @@ -80,23 +85,23 @@ extern "C" { }) \ } -#define UNI_WARNING_LOG(...) \ - { \ - UNI_THREADID \ - UNI_THREAD_SAFE({ \ - UNI_LOGD("[WARNING] thread %d file %s line %d ", tid, __FILE__, __LINE__); \ - UNI_LOGD(__VA_ARGS__); \ - }) \ +#define UNI_WARNING_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[WARNING] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ } -#define UNI_ERROR_LOG(...) \ - { \ - UNI_THREADID \ - UNI_THREAD_SAFE({ \ - UNI_LOGD("[ERROR] thread %d file %s line %d ", tid, __FILE__, __LINE__); \ - UNI_LOGD(__VA_ARGS__); \ - }) \ - UNI_EXIT; \ +#define UNI_ERROR_LOG(...) \ + { \ + UNI_THREADID \ + UNI_THREAD_SAFE({ \ + UNI_LOGD("[ERROR] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \ + UNI_LOGD(__VA_ARGS__); \ + }) \ + UNI_EXIT; \ } #ifdef _DEBUG diff --git a/common/uni/include/memory_cpu.h b/common/uni/include/memory_cpu.h new file mode 100644 index 00000000..271f9156 --- /dev/null +++ b/common/uni/include/memory_cpu.h @@ -0,0 +1,123 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
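Note on the new memory_cpu.h below: it centralizes CPU-side allocation, with UNI_MALLOC/UNI_FREE wrapping malloc/free and UNI_OPERATOR_NEW/UNI_OPERATOR_DELETE wrapping operator new/delete; when the library is built with _USE_MEM_CHECK every allocation is recorded in mem_statistics so unfreed blocks can be reported. A minimal usage sketch, not part of the patch; it assumes the header is on the include path and that the program links against the uni library, which defines mem_statistics:

#include "memory_cpu.h"

int main()
{
    void *a = UNI_MALLOC(64);         // returns nullptr only when size == 0
    void *b = UNI_OPERATOR_NEW(128);  // std::bad_alloc is caught and reported via UNI_ERROR_LOG
    UNI_FREE(a);
    UNI_OPERATOR_DELETE(b);
    // With _USE_MEM_CHECK defined, UNI_MEM_SIZE() is now 0 and
    // UNI_MEM_STATISTICS() reports no unfreed blocks.
    return 0;
}
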
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_UNI_MEMORY_CPU +#define _H_UNI_MEMORY_CPU + +#include "secure_c_wrapper.h" +#include +#ifdef _USE_MEM_CHECK +#include +extern std::map mem_statistics; +#endif + +inline std::string ptr2Str(const void *p) +{ + char b[64]; +#ifdef _USE_SECURE_C + sprintf_s(b, 64, "%p", p); +#else + sprintf(b, "%p", p); +#endif + return std::string(b); +} + +inline void *UNI_MALLOC(unsigned int size) +{ + void *p = nullptr; + if (size > 0) { + p = malloc(size); + if (p == nullptr) { + UNI_ERROR_LOG("cpu malloc ptr:%p bytes:%u error.\n", p, size); + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu malloc ptr:%p bytes:%u.\n", p, size); + std::string key = ptr2Str(p) + std::string("(alloc by malloc)"); + mem_statistics[key] = size; +#endif + } + return p; +} + +inline void UNI_FREE(void *p) +{ + if (p == nullptr) { + return; + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu free ptr:%p.\n", p); + std::string key = ptr2Str(p) + std::string("(alloc by malloc)"); + if (mem_statistics.find(key) == mem_statistics.end()) { + UNI_ERROR_LOG("try to free unalloc ptr:%p.\n", p); + } else { + mem_statistics.erase(key); + } +#endif + free(p); +} + +inline void *UNI_OPERATOR_NEW(unsigned int size) +{ + void *p = nullptr; + if (size > 0) { + try { + p = operator new(size); + } catch (const std::bad_alloc &e) { + UNI_ERROR_LOG("cpu operator new ptr:%p bytes:%u error.\n", p, size); + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu operator new ptr:%p bytes:%u.\n", p, size); + std::string key = ptr2Str(p) + std::string("(alloc by operator new)"); + mem_statistics[key] = size; +#endif + } + return p; +} + +inline void UNI_OPERATOR_DELETE(void *p) +{ + if (p == nullptr) { + return; + } +#ifdef _USE_MEM_CHECK + UNI_DEBUG_LOG("cpu operator delete ptr:%p.\n", p); + std::string key = ptr2Str(p) + std::string("(alloc by operator new)"); + if (mem_statistics.find(key) == mem_statistics.end()) { + UNI_ERROR_LOG("try to operator delete unalloc ptr:%p.\n", p); + } else { + mem_statistics.erase(key); + } +#endif + operator delete(p); +} + +inline size_t UNI_MEM_SIZE() +{ + size_t size = 0; +#ifdef _USE_MEM_CHECK + for (auto iter : mem_statistics) { + size += iter.second; + } +#endif + return size; +} + +inline void UNI_MEM_STATISTICS() +{ +#ifdef _USE_MEM_CHECK + for (auto iter : mem_statistics) { + UNI_ERROR_LOG("ptr:%s bytes:%u is not free.\n", iter.first.c_str(), iter.second); + } +#endif +} +#endif diff --git a/common/uni/include/operator_type.h b/common/uni/include/operator_type.h index 
455a25f8..72e8775d 100644 --- a/common/uni/include/operator_type.h +++ b/common/uni/include/operator_type.h @@ -92,14 +92,14 @@ typedef enum { OT_SoftPlus = 69, OT_Exp = 70, - OT_Split = 71, + OT_OneHot = 71, OT_Tdnn = 72, OT_Dropout = 73, OT_TopK = 74, OT_SpaceToBatchNd = 75, OT_BatchToSpaceNd = 76, OT_Abs = 77, - OT_Equal = 78, + OT_NonZero = 78, OT_Sign = 79, OT_HSwishNoDiv = 80, @@ -113,7 +113,18 @@ typedef enum { OT_GenerateProposals = 88, OT_RoIAlign = 89, - OT_GAT = 90 + OT_GAT = 90, + OT_QuantizeLinear = 91, + OT_Round = 92, + OT_Floor = 93, + OT_Ceil = 94, + OT_RandomUniform = 95, + OT_CumSum = 96, + OT_GridSample = 97, + OT_NonMaxSuppression = 98, + OT_Range = 99, + + OT_Swish = 100 } OperatorType; inline const char *const *OperatorTypeName() @@ -140,13 +151,16 @@ inline const char *const *OperatorTypeName() "OT_DetectionOutput", "OT_Yolov3DetectionOutput", "OT_MultiHeadAttention", "OT_SqDiff", "OT_Tile", "OT_Splice", "OT_Neg", "OT_Greater", "OT_Where", "OT_SoftPlus", "OT_Exp", - "OT_Split", "OT_Tdnn", "OT_Dropout", "OT_TopK", "OT_SpaceToBatchNd", "OT_BatchToSpaceNd", - "OT_Abs", "OT_Equal", "OT_Sign", "OT_HSwishNoDiv", + "OT_OneHot", "OT_Tdnn", "OT_Dropout", "OT_TopK", "OT_SpaceToBatchNd", "OT_BatchToSpaceNd", + "OT_Abs", "OT_NonZero", "OT_Sign", "OT_HSwishNoDiv", "OT_InstanceNorm", "OT_Expand", "OT_Scatter", "OT_Select", "OT_Not", "OT_Reciprocal", "OT_Log", "OT_GenerateProposals", "OT_RoIAlign", - "OT_GAT"}; + "OT_GAT", "OT_QuantizeLinear", "OT_Round", "OT_Floor", "OT_Ceil", "OT_RandomUniform", + "OT_CumSum", "OT_GridSample", "OT_NonMaxSuppression", "OT_Range", + + "OT_Swish"}; return names; } #endif diff --git a/common/uni/include/parameter_spec.h b/common/uni/include/parameter_spec.h index 6ec321c2..a3cf1296 100644 --- a/common/uni/include/parameter_spec.h +++ b/common/uni/include/parameter_spec.h @@ -22,21 +22,24 @@ typedef enum { POOLING_MAX, POOLING_MEAN } PoolingMode; -typedef enum { CEIL, FLOOR, TF_SAME, TF_VALID, ROUND_PREFER_FLOOR, ROUND_PREFER_CEIL } RoundMode; - -typedef enum { LINEAR, NEAREST, CUBIC } ResizeMode; - typedef enum { - ROIALIGN_HALF_PIXEL, - ROIALIGN_OUTPUT_HALF_PIXEL -} ROIAlignCoordinateTransformationMode; + ROUND_CEIL, + ROUND_FLOOR, + ROUND_TF_SAME, + ROUND_TF_VALID, + ROUND_PREFER_FLOOR, + ROUND_PREFER_CEIL +} RoundMode; + +typedef enum { RESIZE_LINEAR, RESIZE_NEAREST, RESIZE_CUBIC } ResizeMode; typedef enum { - ALIGN_CORNERS, - HALF_PIXEL, - PYTORCH_HALF_PIXEL, - ASYMMETRIC -} ResizeCoordinateTransMode; + COORDINATE_TRANS_ALIGN_CORNERS, + COORDINATE_TRANS_HALF_PIXEL, + COORDINATE_TRANS_PYTORCH_HALF_PIXEL, + COORDINATE_TRANS_ASYMMETRIC, + COORDINATE_TRANS_OUTPUT_HALF_PIXEL +} CoordinateTransMode; typedef enum { ELTWISE_SUM, @@ -47,7 +50,6 @@ typedef enum { ELTWISE_DIV, ELTWISE_SQRT, ELTWISE_ERF, - ELTWISE_AND, ELTWISE_OR, ELTWISE_XOR @@ -71,23 +73,35 @@ typedef enum { ACTIVATION_H_SWISH_NODIV, ACTIVATION_LOG, ACTIVATION_NOT, - ACTIVATION_NEG + ACTIVATION_NEG, + ACTIVATION_ROUND, + ACTIVATION_FLOOR, + ACTIVATION_CEIL, + ACTIVATION_SWISH, + ACTIVATION_RECIPROCAL } ActivationMode; -typedef enum { BSliceApply_NULL, BSliceApply_CONV } BilateralSliceApplyMode; +typedef enum { BSLICE_APPLY_NULL, BSLICE_APPLY_CONV } BilateralSliceApplyMode; typedef enum { - Convolution_Pointwise, - Convolution_Dilation, - Convolution_Depthwise, - Convolution_Depthwise_Pointwise, - Convolution_Deconvolution, - Convolution_Depthwise_Deconvolution + CONVOLUTION_POINTWISE, + CONVOLUTION_DILATION, + CONVOLUTION_DEPTHWISE, + CONVOLUTION_DEPTHWISE_POINTWISE, + 
CONVOLUTION_DECONVOLUTION, + CONVOLUTION_DEPTHWISE_DECONVOLUTION } ConvolutionMode; -typedef enum { Pad_Constant, Pad_Reflect, Pad_Edge, Pad_Symmetric } PadMode; +typedef enum { PAD_CONSTANT, PAD_REFLECT, PAD_EDGE, PAD_SYMMETRIC } PadMode; -typedef enum { CHECK_EQUAL, CHECK_GREATEQUAL, CHECK_GREAT } CheckMode; +typedef enum { + CHECK_EQUAL, + CHECK_GREATER_EQUAL, + CHECK_GREATER, + CHECK_LESS, + CHECK_LESS_EQUAL, + CHECK_NOT_EQUAL +} CheckMode; typedef enum { REDUCTION_SUM, @@ -112,16 +126,6 @@ typedef enum { BGR_SC_RAW = 5 } ImageFormat; -#pragma pack(8) -typedef struct ActivationParamSpec { - ActivationMode mode; - float value[4] = {0, 0, 0, 0}; -} ActivationParamSpec; - -typedef struct { - bool propagate_down; -} PReLUParamSpec; - typedef enum { CONVOLUTION_NO_TMP_MEM, CONVOLUTION_FASTEST, @@ -137,30 +141,11 @@ typedef enum { CONVOLUTION_ALGORITHM_GEMM_ICNCHW, CONVOLUTION_ALGORITHM_WINOGRAD, CONVOLUTION_ALGORITHM_BNN, - CONVOLUTION_ALGORITHM_DIRECT_SPE_CK, + CONVOLUTION_ALGORITHM_INVGEMM, CONVOLUTION_ALGORITHM_GROUP_DECONV, CONVOLUTION_ALGORITHM_NULL } ConvolutionForwardAlgorithm; -typedef struct { - float xmin; - float ymin; - float xmax; - float ymax; - unsigned int label; -} BoxRect; - -typedef struct { - unsigned int label; - I64 box_index; -} BoxInfo; - -typedef struct { - unsigned int max_output_boxes_per_class; - float iou_threshold; - float score_threshold; -} NonMaxSuppressionParamSpec; - typedef enum { DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, @@ -170,28 +155,47 @@ typedef enum { DEPTHWISE_CONVOLUTION_ALGORITHM_NULL } DepthwiseConvolutionForwardAlgorithm; +#pragma pack(8) +typedef struct ActivationParamSpec { + ActivationMode mode; + float value[4] = {0, 0, 0, 0}; +} ActivationParamSpec; + +typedef struct { + bool propagate_down; +} PReLUParamSpec; + +typedef struct { + int center_point_box; + unsigned int max_output_boxes_per_class; + float iou_threshold; + float score_threshold; +} NonMaxSuppressionParamSpec; + typedef struct { + // save h, w unsigned int sizes[2]; + // save n, c, h, w float scales[4]; unsigned int num_sizes; unsigned int num_scales; ResizeMode mode; - ResizeCoordinateTransMode trans_mode; + CoordinateTransMode trans_mode; RoundMode round_mode; } ResizeParamSpec; typedef struct { int axes[8]; - int axes_num; + int num_axes; } SqueezeParamSpec; typedef struct { int axes[8]; - int axes_num; + int num_axes; } UnsqueezeParamSpec; typedef struct { - DataType targetDt; + DataType dt; } CastParamSpec; typedef struct { @@ -204,8 +208,8 @@ typedef struct { } ReLUParamSpec; typedef struct { - float coeff_values[8]; - int coeff_size; + float coeff[8]; + int num_coeff; } EltwiseSumSpec; typedef struct { @@ -219,8 +223,8 @@ typedef union { } ActivationSpec; typedef struct { - EltwiseMode elt_mode; - EltwiseSumSpec elt_sum_spec; + EltwiseMode mode; + EltwiseSumSpec sum_spec; ActivationMode activation_type; ActivationSpec activation_spec; } EltwiseParamSpec; @@ -233,12 +237,12 @@ typedef struct { unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; unsigned int group; unsigned int dilatedRate_t; unsigned int dilatedRate_h; @@ -248,7 +252,10 @@ typedef struct { ActivationMode 
dw_activation_type; ActivationMode pw_activation_type; ActivationSpec activation_spec; - RoundMode rm; + RoundMode round_mode; + unsigned int output_pad_t; + unsigned int output_pad_h; + unsigned int output_pad_w; } ConvolutionParamSpec; typedef struct { @@ -258,14 +265,15 @@ typedef struct { unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; - RoundMode rm; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; + RoundMode round_mode; PoolingMode mode; + bool count_include_pad; } PoolingParamSpec; // FC's weight is reordered to NxK, K is removed dimension. @@ -307,8 +315,8 @@ typedef struct { } PadParamSpec; typedef struct { - unsigned int input_dim; - unsigned int num_output; + unsigned int num_inputs; + unsigned int num_outputs; bool bias_term; bool transpose; int axis; @@ -321,21 +329,22 @@ typedef struct { } PowerParamSpec; typedef struct { - int shape_dims[8]; - int shape_size; + int shape[8]; + int num_shape; int axis; int num_axes; } ReshapeParamSpec; typedef struct { int slice_points[8]; - unsigned int slice_size; + unsigned int num_slice; int axis; } SliceParamSpec; -typedef struct { - unsigned int trans_dims[8]; - unsigned int trans_size; +typedef struct TransposeParamSpec { + unsigned int axes[8]; + unsigned int num_axes; + DataFormat df = DF_NCHW; } TransposeParamSpec; typedef struct { @@ -346,29 +355,29 @@ typedef struct { typedef struct { RNNMode mode; - unsigned int numOutput; + unsigned int num_outputs; // steps >= 0 for multi-steps RNN // steps = -1 for RNNCell int steps; - int numProjection; - float zoneoutCell; - float zoneoutOutput; + int num_projection; + float zoneout_cell; + float zoneout_output; - bool biDirection; - float forgetBias; - ActivationMode activationMode; + bool bi_direction; + float forget_bias; + ActivationMode activation_type; } RNNParamSpec; typedef struct { - unsigned int coefficient_len; + unsigned int coefficient; BilateralSliceApplyMode mode; bool has_offset; } BilateralSliceApplyParamSpec; typedef struct { int axes[8]; - int axes_num; - ReductionMode reduction_mode; + int num_axes; + ReductionMode mode; float coeff; bool keep_dim; } ReductionParamSpec; @@ -384,7 +393,7 @@ typedef struct { } CopyParamSpec; typedef struct { - CheckMode check_mode; + CheckMode mode; } CheckParamSpec; typedef struct { @@ -392,8 +401,9 @@ typedef struct { int axis; } RepeatParamSpec; -typedef struct { +typedef struct PreAllocatedMemoryParamSpec { TensorDesc desc; + float value = 0; } PreAllocatedMemoryParamSpec; typedef struct { @@ -434,7 +444,7 @@ typedef struct { char ellipsis_mask[8]; char new_axis_mask[8]; char shrink_axis_mask[8]; - unsigned int dim_size; + unsigned int num_dims; } TfSliceParamSpec; typedef struct { @@ -478,17 +488,17 @@ typedef struct { } ChannelResizeParamSpec; typedef struct { - int blockSize; + int block_size; } Space2DepthParamSpec; typedef struct { - int blockSize; - I8 reMode[8]; + int block_size; + I8 mode[8]; } Depth2SpaceParamSpec; typedef struct { - int repeatsInfo[8]; - int dimsSize; + int repeats[8]; + int num_repeats; int axis; } TileParamSpec; @@ -511,26 +521,21 @@ typedef struct { FullyConnectedParamSpec fc_desc[6]; PowerParamSpec power_spec; bool eltwiseWithLayerNormIn[2]; - ActivationMode actiMode; + ActivationMode activation_type; 
ReshapeParamSpec reshapeDesc[4]; EltwiseParamSpec eltwiseDesc[2]; -} MultiheadAttentionParamSpec; +} MultiHeadAttentionParamSpec; typedef struct { int axis; int largest; int sorted; - int topk; + int k; } TopKParamSpec; typedef struct { - TensorDesc conditionDesc; - TensorDesc yDesc; -} WhereParamSpec; - -typedef struct { - int shape_dims[8]; - int shape_size; + int shape[8]; + int num_shape; } ExpandParamSpec; typedef struct ScatterParamSpec { @@ -558,17 +563,13 @@ typedef struct GatherParamSpec { int batch_dims = 0; } GatherParamSpec; -typedef struct EqualParamSpec { - bool invert = false; -} EqualParamSpec; - typedef struct { unsigned int num_heads; - ActivationParamSpec activation; + ActivationParamSpec activation_type; } GATParamSpec; typedef struct RoIAlignParamSpec { - ROIAlignCoordinateTransformationMode coordinateTransformationMode; + CoordinateTransMode trans_mode; PoolingMode mode; unsigned int output_h; unsigned int output_w; @@ -589,6 +590,57 @@ typedef struct GenerateProposalsParamSpec { float spatial_scale; } GenerateProposalsParamSpec; +typedef struct QuantizeLinearParamSpec { + // get the scales from input tensor + int axis; + DataType dt; +} QuantizeLinearParamSpec; + +typedef struct { + int axis; + float eps; +} LayerNormParamSpec; + +typedef struct RandomUniformParamSpec { + DataType dt; + float low; + float high; + float seed; + int shape[8]; + int num_shape; +} RandomUniformParamSpec; + +typedef struct CumSumParamSpec { + bool exclusive; + bool reverse; + bool axis; +} CumSumParamSpec; + +typedef struct GridSampleParamSpec { + ResizeMode mode; + PadMode pad_mode; + float constant_value = 0; + bool align_corners; +} GridSampleParamSpec; + +typedef struct OneHotParamSpec { + int axis; + int depth; + float values[2]; +} OneHotParamSpec; + +typedef struct ConstantOfShapeParamSpec { + DataType dt; + float value = 0; +} ConstantOfShapeParamSpec; + +typedef struct RangeParamSpec { + DataType dt; + float start; + float limit; + float delta; +} RangeParamSpec; + typedef union ParameterSpec { ParameterSpec() {} @@ -634,18 +686,25 @@ typedef union ParameterSpec { PriorBoxParamSpec prior_box_spec; DetectionOutputParamSpec detection_output_spec; Yolov3DetectionOutputParamSpec yolov3_detection_output_spec; - MultiheadAttentionParamSpec multiheadAttention_spec; + MultiHeadAttentionParamSpec multihead_attention_spec; TileParamSpec tile_spec; SpliceParamSpec splice_spec; TdnnParamSpec tdnn_spec; TopKParamSpec topk_spec; - WhereParamSpec where_spec; ExpandParamSpec expand_spec; ScatterParamSpec scatter_spec; - EqualParamSpec equal_spec; RoIAlignParamSpec roialign_spec; GenerateProposalsParamSpec generate_proposals_spec; GATParamSpec gat_spec; + QuantizeLinearParamSpec quant_spec; + LayerNormParamSpec ln_spec; + RandomUniformParamSpec random_uniform_spec; + CumSumParamSpec cumsum_spec; + GridSampleParamSpec grid_sample_spec; + OneHotParamSpec onehot_spec; + NonMaxSuppressionParamSpec non_max_suppression_spec; + ConstantOfShapeParamSpec constant_of_shape_spec; + RangeParamSpec range_spec; } ParameterSpec; typedef struct { @@ -654,7 +713,7 @@ typedef struct { } QuantSpec; #pragma pack() -inline int get_operator_parameter_size(OperatorType operatorType) +inline int get_operator_parameter_size(int version, OperatorType operatorType) { std::map operatorParameterSizeMap = {{OT_Conv, sizeof(ConvolutionParamSpec)}, {OT_Deconvolution, sizeof(ConvolutionParamSpec)}, {OT_FC, sizeof(FullyConnectedParamSpec)}, @@ -683,20 +742,41 @@ inline int get_operator_parameter_size(OperatorType operatorType) 
{OT_RelativeShift, sizeof(RelativeShiftParamSpec)}, {OT_PriorBox, sizeof(PriorBoxParamSpec)}, {OT_DetectionOutput, sizeof(DetectionOutputParamSpec)}, {OT_Yolov3DetectionOutput, sizeof(Yolov3DetectionOutputParamSpec)}, - {OT_MultiHeadAttention, sizeof(MultiheadAttentionParamSpec)}, + {OT_MultiHeadAttention, sizeof(MultiHeadAttentionParamSpec)}, {OT_Tile, sizeof(TileParamSpec)}, {OT_Splice, sizeof(SpliceParamSpec)}, {OT_Tdnn, sizeof(TdnnParamSpec)}, {OT_TopK, sizeof(TopKParamSpec)}, - {OT_Where, sizeof(WhereParamSpec)}, {OT_Expand, sizeof(ExpandParamSpec)}, - {OT_InstanceNorm, sizeof(InstanceNormParamSpec)}, {OT_Scatter, sizeof(ScatterParamSpec)}, - {OT_LogSoftmax, sizeof(SoftmaxParamSpec)}, {OT_Equal, sizeof(EqualParamSpec)}, + {OT_Expand, sizeof(ExpandParamSpec)}, {OT_InstanceNorm, sizeof(InstanceNormParamSpec)}, + {OT_Scatter, sizeof(ScatterParamSpec)}, {OT_LogSoftmax, sizeof(SoftmaxParamSpec)}, {OT_GenerateProposals, sizeof(GenerateProposalsParamSpec)}, - {OT_RoIAlign, sizeof(RoIAlignParamSpec)}, {OT_GAT, sizeof(GATParamSpec)}}; + {OT_RoIAlign, sizeof(RoIAlignParamSpec)}, {OT_GAT, sizeof(GATParamSpec)}, + {OT_QuantizeLinear, sizeof(QuantizeLinearParamSpec)}, + {OT_LayerNorm, sizeof(LayerNormParamSpec)}, + {OT_QuantizeLinear, sizeof(QuantizeLinearParamSpec)}, {OT_CumSum, sizeof(CumSumParamSpec)}, + {OT_RandomUniform, sizeof(RandomUniformParamSpec)}, + {OT_GridSample, sizeof(GridSampleParamSpec)}, {OT_OneHot, sizeof(OneHotParamSpec)}, + {OT_NonMaxSuppression, sizeof(NonMaxSuppressionParamSpec)}, + {OT_Range, sizeof(RangeParamSpec)}, {OT_ConstantOfShape, sizeof(ConstantOfShapeParamSpec)}}; int size; if (operatorParameterSizeMap.find(operatorType) == operatorParameterSizeMap.end()) { size = 0; } else { size = operatorParameterSizeMap[operatorType]; } + if (version == 20201120) { + if (operatorType == OT_Conv || operatorType == OT_Deconvolution) { + size -= 3 * sizeof(unsigned int); + } + if (operatorType == OT_LayerNorm) { + size = 0; + } + } else { + size = (size + 3) / 4 * 4; + } + if (version == 20201120 || version == 20211021) { + if (operatorType == OT_Transpose) { + size -= sizeof(DataFormat); + } + } return size; } @@ -707,12 +787,12 @@ inline ConvolutionParamSpec createConvolutionParamSpec(unsigned int group, unsigned int stride_t, unsigned int stride_h, unsigned int stride_w, - unsigned int padding_before, - unsigned int padding_after, - unsigned int padding_top, - unsigned int padding_bottom, - unsigned int padding_left, - unsigned int padding_right, + unsigned int pad_before, + unsigned int pad_after, + unsigned int pad_top, + unsigned int pad_bottom, + unsigned int pad_left, + unsigned int pad_right, unsigned int dilateRate_t, unsigned int dilateRate_h, unsigned int dilateRate_w, @@ -727,17 +807,20 @@ inline ConvolutionParamSpec createConvolutionParamSpec(unsigned int group, p.stride_t = stride_t; p.stride_h = stride_h; p.stride_w = stride_w; - p.padding_before = padding_before; - p.padding_after = padding_after; - p.padding_top = padding_top; - p.padding_bottom = padding_bottom; - p.padding_left = padding_left; - p.padding_right = padding_right; + p.pad_before = pad_before; + p.pad_after = pad_after; + p.pad_top = pad_top; + p.pad_bottom = pad_bottom; + p.pad_left = pad_left; + p.pad_right = pad_right; p.dilatedRate_t = dilateRate_t; p.dilatedRate_h = dilateRate_h; p.dilatedRate_w = dilateRate_w; p.num_outputs = num_outputs; p.convolution_type = convMode; + p.output_pad_t = 0; + p.output_pad_h = 0; + p.output_pad_w = 0; return p; } @@ -762,13 +845,13 @@ inline PoolingParamSpec 
createPoolingParamSpec(PoolingMode pm, unsigned int stride_t, unsigned int stride_h, unsigned int stride_w, - unsigned int padding_before, - unsigned int padding_after, - unsigned int padding_top, - unsigned int padding_bottom, - unsigned int padding_left, - unsigned int padding_right, - RoundMode rm) + unsigned int pad_before, + unsigned int pad_after, + unsigned int pad_top, + unsigned int pad_bottom, + unsigned int pad_left, + unsigned int pad_right, + RoundMode round_mode) { PoolingParamSpec p; p.mode = pm; @@ -778,26 +861,25 @@ inline PoolingParamSpec createPoolingParamSpec(PoolingMode pm, p.stride_t = stride_t; p.stride_h = stride_h; p.stride_w = stride_w; - p.padding_before = padding_before; - p.padding_after = padding_after; - p.padding_top = padding_top; - p.padding_bottom = padding_bottom; - p.padding_left = padding_left; - p.padding_right = padding_right; - p.rm = rm; + p.pad_before = pad_before; + p.pad_after = pad_after; + p.pad_top = pad_top; + p.pad_bottom = pad_bottom; + p.pad_left = pad_left; + p.pad_right = pad_right; + p.round_mode = round_mode; return p; } -inline ReshapeParamSpec createReshapeParamSpec( - int *shape_dims, int shape_size, int axis, int num_axes) +inline ReshapeParamSpec createReshapeParamSpec(int *shape, int num_shape, int axis, int num_axes) { ReshapeParamSpec p; - p.shape_size = shape_size; + p.num_shape = num_shape; p.axis = axis; p.num_axes = num_axes; - if (shape_dims != nullptr && shape_size != 0) { - for (int i = 0; i < shape_size; i++) { - p.shape_dims[i] = shape_dims[i]; + if (shape != nullptr && num_shape != 0) { + for (int i = 0; i < num_shape; i++) { + p.shape[i] = shape[i]; } } return p; @@ -811,12 +893,12 @@ inline ClipParamSpec createClipParamSpec(float min, float max) return p; } -inline SqueezeParamSpec createSqueezeParamSpec(int *axes, int axes_num) +inline SqueezeParamSpec createSqueezeParamSpec(int *axes, int num_axes) { SqueezeParamSpec p; - p.axes_num = axes_num; - if (axes != nullptr && axes_num != 0) { - for (int i = 0; i < axes_num; i++) { + p.num_axes = num_axes; + if (axes != nullptr && num_axes != 0) { + for (int i = 0; i < num_axes; i++) { p.axes[i] = axes[i]; } } diff --git a/common/uni/include/profiling.h b/common/uni/include/profiling.h index e987be19..31e29740 100644 --- a/common/uni/include/profiling.h +++ b/common/uni/include/profiling.h @@ -18,15 +18,21 @@ double ut_time_ms(); void ut_time_init(); +void ut_time_start(); +void ut_time_stop(); void ut_time_process( const std::string &name, const std::string &category, double time_start_ms, double time_end_ms); void ut_time_statistics(); #ifdef _PROFILE_STATISTICS #define UNI_TIME_INIT ut_time_init(); +#define UNI_TIME_START ut_time_start(); +#define UNI_TIME_STOP ut_time_stop(); #define UNI_TIME_STATISTICS ut_time_statistics(); #else #define UNI_TIME_INIT +#define UNI_TIME_START +#define UNI_TIME_STOP #define UNI_TIME_STATISTICS #endif diff --git a/common/uni/include/secure_c_wrapper.h b/common/uni/include/secure_c_wrapper.h new file mode 100644 index 00000000..06b3e9aa --- /dev/null +++ b/common/uni/include/secure_c_wrapper.h @@ -0,0 +1,66 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
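Note on the new secure_c_wrapper.h below: it routes memory and string operations through the securec functions (memcpy_s, memset_s, strcpy_s, sscanf_s, snprintf_truncated_s) when _USE_SECURE_C is defined via the USE_SECURE_C build option, and falls back to the standard libc calls otherwise; UNI_MEMCPY additionally tolerates src == dst and size == 0. A minimal usage sketch, not part of the patch; the format string used here behaves identically under sscanf and sscanf_s:

#include "secure_c_wrapper.h"

int main()
{
    char name[16];
    UNI_MEMSET(name, 0, sizeof(name));
    UNI_STRCPY(name, "bolt");
    char copy[16];
    UNI_MEMCPY(copy, name, sizeof(name));
    int value = 0;
    UNI_SSCANF("42", "%d", &value);   // value == 42 with either backend
    return 0;
}
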
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_SECURE_C_WRAPPER +#define _H_SECURE_C_WRAPPER +#ifdef _USE_SECURE_C +#include +#else +#include +#endif + +#include "error.h" + +inline void UNI_MEMCPY(void *dst, const void *src, size_t size) +{ + if (src == dst || size == 0) { + return; + } + if (dst == NULL || src == NULL) { + printf("cpu memcpy error dst:%p src:%p bytes:%d.\n", dst, src, (int)size); + } + //UNI_DEBUG_LOG("cpu memcpy dst:%p src:%p bytes:%d.\n", dst, src, (int)size); +#ifdef _USE_SECURE_C + memcpy_s(dst, size, src, size); +#else + memcpy(dst, src, size); +#endif +} + +inline void UNI_MEMSET(void *dst, int c, size_t size) +{ +#ifdef _USE_SECURE_C + memset_s(dst, size, c, size); +#else + memset(dst, c, size); +#endif +} + +inline void UNI_STRCPY(char *dst, const char *src) +{ +#ifdef _USE_SECURE_C + strcpy_s(dst, strlen(src) + 1, src); +#else + strcpy(dst, src); +#endif +} + +#ifdef _USE_SECURE_C +#define UNI_SSCANF sscanf_s +#define UNI_SNPRINTF snprintf_truncated_s +#else +#define UNI_SSCANF sscanf +#define UNI_SNPRINTF snprintf +#endif + +#endif diff --git a/common/uni/include/thread_affinity.h b/common/uni/include/thread_affinity.h index db21b777..2f63dc00 100644 --- a/common/uni/include/thread_affinity.h +++ b/common/uni/include/thread_affinity.h @@ -17,15 +17,14 @@ #ifndef _WIN32 #include #include -#endif -#ifdef _USE_OPENMP -#include +#else +#include #endif #include -#include #include "sys.h" #include "error.h" #include "data_type.h" +#include "affinity_policy.h" #ifdef _USE_X86 #define __cpuid(data, eaxIn, ecxIn) \ @@ -34,53 +33,6 @@ : "0"(eaxIn), "2"(ecxIn)) #endif -const int CPU_MAX_NUMBER = 128; -#ifdef _USE_OPENMP -#define OMP_MAX_NUM_THREADS \ - (getenv("OMP_NUM_THREADS") == NULL ? 
omp_get_num_procs() : atoi(getenv("OMP_NUM_THREADS"))) -#else -#define OMP_MAX_NUM_THREADS 1 -#endif -extern int OMP_NUM_THREADS; - -typedef enum { - AFFINITY_CPU_LOW_POWER = 0, - AFFINITY_CPU_HIGH_PERFORMANCE = 1, - AFFINITY_GPU = 2 -} AffinityPolicy; - -typedef struct CpuStat { - unsigned long idle; - unsigned long total; -} CpuStat; - -typedef struct DeviceInfo { - int cpuNum; - Arch archs[CPU_MAX_NUMBER]; - long freqs[CPU_MAX_NUMBER]; - float occupys[CPU_MAX_NUMBER]; - int cpuids[CPU_MAX_NUMBER]; - CpuStat cpuStats[CPU_MAX_NUMBER]; - - float maxOccupy; - AffinityPolicy affinityPolicy; - Arch schedule; -} DeviceInfo; - -inline const char *const *AffinityPolicyNames() -{ - static const char *const names[] = { - "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"}; - return names; -} - -inline const AffinityPolicy *AffinityPolicies() -{ - static const AffinityPolicy policies[] = { - AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU}; - return policies; -} - inline int get_cpus_num() { int cpuNum = 0; @@ -166,7 +118,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) } const int bufferSize = 1024; char buffer[bufferSize]; - while (!feof(fp)) { + while (!feof(fp) && cpuid < cpuNum) { char *status = fgets(buffer, bufferSize, fp); if (!status) { break; @@ -175,7 +127,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) if (memcmp(buffer, "CPU part", 8) == 0) { Arch arch = ARM_V8; int id = 0; - sscanf(buffer, "CPU part\t: %x", &id); + UNI_SSCANF(buffer, "CPU part\t: %x", &id); switch (id) { case 0xc07: arch = ARM_V7; @@ -244,7 +196,7 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) arch = ARM_V8; break; default: - UNI_WARNING_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id); + UNI_DEBUG_LOG("unknown CPU %d arch %x, set to ARM_V8\n", cpuid, id); break; } archs[cpuid++] = arch; @@ -257,6 +209,28 @@ inline void get_cpus_arch(Arch *archs, int cpuNum) } } +inline Arch get_cpu_arch() +{ + static bool blank = true; + static Arch arch = CPU_GENERAL; + if (blank) { + UNI_THREAD_SAFE({ + if (blank) { + int num = get_cpus_num(); + Arch archs[CPU_MAX_NUMBER]; + get_cpus_arch(archs, num); + for (int i = 0; i < num; i++) { + if (archs[i] > arch) { + arch = archs[i]; + } + } + blank = false; + } + }); + } + return arch; +} + inline long get_cpu_freq(int cpuid) { long maxFrequency = -1; @@ -264,24 +238,26 @@ inline long get_cpu_freq(int cpuid) char path[256]; FILE *fp = NULL; if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - snprintf( + UNI_SNPRINTF( path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); fp = fopen(path, "rb"); } if (fp == NULL) { - UNI_WARNING_LOG("can not get CPU max frequency\n"); + UNI_DEBUG_LOG("can not get CPU max frequency\n"); } else { - fscanf(fp, "%ld", &maxFrequency); + char buffer[32]; + fgets(buffer, 32, fp); + UNI_SSCANF(buffer, "%ld", &maxFrequency); fclose(fp); } #endif @@ -314,7 +290,7 @@ inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum) for (int i = 0; i < cpuNum; i++) { fgets(buffer, bufferSize, fp); - sscanf(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle, + UNI_SSCANF(buffer, "%s %lu %lu %lu %lu %lu %lu %lu", name, &user, &nice, &system, &idle, 
&iowait, &irq, &softirq); total = user + nice + system + idle + iowait + irq + softirq; cpuOccupy[i] = 0; @@ -334,9 +310,9 @@ inline void get_cpus_occupy(CpuStat *cpuStat, float *cpuOccupy, int cpuNum) inline void swap_variable(void *a, void *b, const int size) { char buffer[size]; - memcpy(buffer, a, size); - memcpy(a, b, size); - memcpy(b, buffer, size); + UNI_MEMCPY(buffer, a, size); + UNI_MEMCPY(a, b, size); + UNI_MEMCPY(b, buffer, size); } inline void disable_cpus(float *occupys, int *cpuids, int cpuNum, float cpuOccupyMax) @@ -386,7 +362,19 @@ inline void sort_cpus_by_arch_freq_occupy( inline int set_thread_affinity(int threadid, const int *cpuids, int num) { -#if !(defined(__APPLE__) || defined(_WIN32)) +#ifdef _WIN32 + DWORD_PTR mask = 0x0; + for (int i = 0; i < num; i++) { + UNI_DEBUG_LOG("bind thread %d to core %d\n", threadid, cpuids[i]); + DWORD_PTR m = 0x1; + for (int j = 0; j < cpuids[i]; j++) { + m = m << 1; + } + mask |= m; + } + HANDLE thread = GetCurrentThread(); + SetThreadAffinityMask(thread, mask); +#elif !defined(__APPLE__) UNI_THREADID; cpu_set_t mask; CPU_ZERO(&mask); @@ -396,38 +384,13 @@ inline int set_thread_affinity(int threadid, const int *cpuids, int num) } int status = syscall(__NR_sched_setaffinity, tid, sizeof(mask), &mask); if (status) { - UNI_WARNING_LOG("fail to set affinity %d\n", status); + UNI_DEBUG_LOG("fail to set affinity %d\n", status); return -1; } #endif return 0; } -inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name) -{ - int nameLength = strlen(name); - for (int i = 0; i < 3; i++) { - const char *target = AffinityPolicyNames()[i]; - int targetLength = strlen(target); - if (nameLength < targetLength) { - continue; - } - int match = 1; - for (int j = 0; j < targetLength; j++) { - if (name[j] == target[j] || name[j] == target[j] + 32) { - continue; - } else { - match = 0; - break; - } - } - if (match) { - return AffinityPolicies()[i]; - } - } - return AFFINITY_CPU_HIGH_PERFORMANCE; -} - inline Arch thread_affinity_set_by_policy( Arch *archs, int *cpuids, int cpuNum, AffinityPolicy policy, int threadId) { @@ -435,7 +398,9 @@ inline Arch thread_affinity_set_by_policy( UNI_WARNING_LOG("can not allocate more cores for thread %d\n", threadId); return CPU_GENERAL; } - if (policy == AFFINITY_GPU) { + if (policy == AFFINITY_CPU) { + return archs[cpuNum - 1]; + } else if (policy == AFFINITY_GPU) { return MALI; } #ifndef _USE_OPENMP @@ -481,6 +446,12 @@ inline Arch thread_affinity_set_by_policy( candidates[count++] = i; } } + if (OMP_NUM_THREADS > count) { + count = 0; + for (int i = 0; i < cpuNum; i++) { + candidates[count++] = i; + } + } set_thread_affinity(threadId, candidates, count); Arch arch = archs[index]; #endif @@ -546,21 +517,4 @@ inline void set_cpu_dynamic(DeviceInfo *deviceInfo, int threadId) deviceInfo->schedule = MALI; } } - -inline void set_cpu_num_threads(int threadNum) -{ -#ifndef _USE_OPENMP - if (threadNum > 1) { - UNI_WARNING_LOG("this library not support multi-threads parallel, please rebuild with " - "--openmp option.\n"); - } -#endif - if (threadNum < 0) { - threadNum = 1; - } - if (threadNum > OMP_MAX_NUM_THREADS) { - threadNum = OMP_MAX_NUM_THREADS; - } - OMP_NUM_THREADS = threadNum; -} #endif diff --git a/common/uni/include/uni.h b/common/uni/include/uni.h index 1af06c54..99499ccc 100644 --- a/common/uni/include/uni.h +++ b/common/uni/include/uni.h @@ -14,39 +14,21 @@ #ifndef _H_UNI #define _H_UNI -#include -#include - #include "sys.h" #include "data_type.h" #include "operator_type.h" #include 
"parameter_spec.h" #include "error.h" #include "array_transpose.h" +#include "memory_cpu.h" +#include "affinity_policy.h" #define UNUSED(x) (void)x #define UNI_MIN(a, b) (((a) < (b)) ? (a) : (b)) #define UNI_MAX(a, b) (((a) > (b)) ? (a) : (b)) #define UNI_ABS(a) (((a) > 0) ? (a) : (-1 * (a))) #define UNI_SIGN(a) (((a) > 0) ? 1 : (((a) < 0) ? -1 : 0)) +#define UNI_ALIGN(a, b) (((a + b - 1) / b) * b) #define UNI_F16_MIN -65504.0f #define UNI_F16_MAX 65504.0f - -inline int UNI_ISNAN(float x) -{ - return isnan(x); -} - -inline int UNI_ISINF(float x) -{ - return isinf(x); -} - -inline void UNI_MEMCPY(void *dst, const void *src, int size) -{ - if (src == dst || size <= 0 || dst == nullptr || src == nullptr) { - return; - } - memcpy(dst, src, size); -} #endif diff --git a/common/uni/include/ut_util.h b/common/uni/include/ut_util.h index e8447ff5..414f3a07 100644 --- a/common/uni/include/ut_util.h +++ b/common/uni/include/ut_util.h @@ -14,7 +14,7 @@ #ifndef _H_UT_UTIL #define _H_UT_UTIL -#include +#include #include "sys.h" #include "uni.h" @@ -181,6 +181,10 @@ inline void ut_check_v( a = ((INT8 *)A)[i]; b = ((INT8 *)B)[i]; break; + case DT_U8: + a = ((U8 *)A)[i]; + b = ((U8 *)B)[i]; + break; case DT_BIN11: a = ((BIN8 *)A)[i]; b = ((BIN8 *)B)[i]; @@ -217,6 +221,9 @@ inline void ut_check_v(void *A, F32 val, U32 len, DataType dt, const char *file, case DT_U32: a = ((U32 *)A)[i]; break; + case DT_U8: + a = ((U8 *)A)[i]; + break; case DT_BIN11: a = ((BIN8 *)A)[i]; break; @@ -245,10 +252,10 @@ inline void ut_check_a(void *A, void *B, U32 len, DataType dt) switch (dt) { case DT_F32: case DT_F16: - memcpy(threshold, threshold_float, sizeof(F32) * num); + UNI_MEMCPY(threshold, threshold_float, sizeof(F32) * num); break; case DT_U8: - memcpy(threshold, threshold_int8, sizeof(F32) * num); + UNI_MEMCPY(threshold, threshold_int8, sizeof(F32) * num); break; default: UNI_ERROR_LOG("unsupported data type.\n"); @@ -274,11 +281,11 @@ inline void ut_check_a(void *A, void *B, U32 len, DataType dt) break; } - if (UNI_ISNAN((float)a) || UNI_ISINF((float)a)) { + if (isnan((float)a) || isinf((float)a)) { UNI_ERROR_LOG("nan or inf value in ut_check_a of input A\n"); return; } - if (UNI_ISNAN((float)b) || UNI_ISINF((float)b)) { + if (isnan((float)b) || isinf((float)b)) { UNI_ERROR_LOG("nan or inf value in ut_check_a of input B\n"); return; } diff --git a/common/uni/include/x86_avx2_expand.h b/common/uni/include/x86_avx2_expand.h index b8d422f7..fbcca54e 100644 --- a/common/uni/include/x86_avx2_expand.h +++ b/common/uni/include/x86_avx2_expand.h @@ -30,22 +30,31 @@ inline unsigned int _mm256_hadd_u32(__m256i x) inline __m256 _mm256_log_ps(__m256 x) { - static const __m256 CONST_one = _mm256_set1_ps(1.0f); - static const __m256 CONST_two = _mm256_set1_ps(2.0f); - static const __m256 CONST_neg_one = _mm256_set1_ps(-1.0f); - F32 i = 30; - __m256 n = _mm256_set1_ps(i); - __m256 nk = _mm256_add_ps(_mm256_mul_ps(CONST_two, n), CONST_one); - x = _mm256_div_ps(_mm256_add_ps(x, CONST_neg_one), _mm256_add_ps(x, CONST_one)); - __m256 xx = _mm256_mul_ps(x, x); - __m256 y = _mm256_div_ps(CONST_one, nk); - for (; i > 0; i--) { - nk = _mm256_sub_ps(nk, CONST_two); - y = _mm256_add_ps(_mm256_div_ps(CONST_one, nk), _mm256_mul_ps(xx, y)); - } - - y = _mm256_mul_ps(CONST_two, _mm256_mul_ps(x, y)); - return y; + __m256i ux = _mm256_castps_si256(x); + __m256 fx = _mm256_cvtepi32_ps(ux); + fx = _mm256_mul_ps(fx, + _mm256_div_ps( + _mm256_set1_ps(1.0f), _mm256_cvtepi32_ps(_mm256_slli_epi32(_mm256_set1_epi32(1), 23)))); + + __m256i umx = 
_mm256_or_si256(_mm256_and_si256(ux, _mm256_set1_epi32(0x007FFFFF)), + _mm256_slli_epi32(_mm256_set1_epi32(0x7e), 23)); + __m256 mx = _mm256_castsi256_ps(umx); + + const __m256 c_124_22551499 = _mm256_set1_ps(124.22551499f); + const __m256 c_1_498030302 = _mm256_set1_ps(1.498030302f); + const __m256 c_1_725877999 = _mm256_set1_ps(1.72587999f); + const __m256 c_0_3520087068 = _mm256_set1_ps(0.3520887068f); + + __m256 tmp = _mm256_div_ps(c_1_725877999, _mm256_add_ps(c_0_3520087068, mx)); + tmp = _mm256_add_ps(c_124_22551499, tmp); + tmp = _mm256_fmadd_ps(c_1_498030302, mx, tmp); + const __m256 c_0_69314718 = _mm256_set1_ps(0.69314718f); + __m256 result_v = _mm256_mul_ps(_mm256_sub_ps(fx, tmp), c_0_69314718); + result_v = _mm256_blendv_ps( + result_v, _mm256_set1_ps(NAN), _mm256_cmp_ps(x, _mm256_set1_ps(0), _CMP_LT_OS)); + result_v = _mm256_blendv_ps( + result_v, _mm256_set1_ps(-INFINITY), _mm256_cmp_ps(x, _mm256_set1_ps(0), _CMP_EQ_OS)); + return result_v; } inline __m256 _mm256_exp_ps(__m256 x) @@ -121,6 +130,17 @@ inline F32 _mm256_sum_ps(__m256 x) return _mm_cvtss_f32(sum); } +inline I32 _mm256_sum_epi32(__m256i x) +{ + __m128i low = _mm256_extractf128_si256(x, 0); + __m128i high = _mm256_extractf128_si256(x, 1); + __m128i sum = _mm_hadd_epi32(low, high); + low = _mm_hadd_epi32(sum, sum); + high = _mm_shuffle_epi32(low, 0b01); + sum = _mm_add_epi32(low, high); + return _mm_cvtsi128_si32(sum); +} + // horizontal min inline F32 _mm256_hmin_ps(__m256 x) { diff --git a/common/uni/src/CMakeLists.txt b/common/uni/src/CMakeLists.txt index ef8301af..3ba24ae9 100644 --- a/common/uni/src/CMakeLists.txt +++ b/common/uni/src/CMakeLists.txt @@ -6,6 +6,10 @@ add_library(${PROJECT_NAME} SHARED ${srcs}) # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SECUREC_SHARED_LIBRARY}) +endif () + set_target_properties(${PROJECT_NAME}_static PROPERTIES OUTPUT_NAME "${PROJECT_NAME}") set_target_properties(${PROJECT_NAME} PROPERTIES CLEAN_DIRECT_OUTPUT 1) set_target_properties(${PROJECT_NAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1) diff --git a/common/uni/src/profiling.cpp b/common/uni/src/profiling.cpp index 9271184f..b7e90851 100644 --- a/common/uni/src/profiling.cpp +++ b/common/uni/src/profiling.cpp @@ -25,9 +25,15 @@ int OMP_NUM_THREADS = OMP_MAX_NUM_THREADS; #ifdef _THREAD_SAFE pthread_mutex_t uniThreadMutex = PTHREAD_MUTEX_INITIALIZER; #endif -std::map time_statistics; +static std::map time_statistics; +static bool time_statistics_flag = true; #ifndef _EAGER_LOG -std::vector logs; +static std::vector logs; +#endif + +#ifdef _USE_MEM_CHECK +#include "memory_cpu.h" +std::map mem_statistics; #endif double ut_time_ms() @@ -40,7 +46,20 @@ double ut_time_ms() void ut_time_init() { - UNI_THREAD_SAFE(time_statistics.clear()); + UNI_THREAD_SAFE({ + time_statistics.clear(); + time_statistics_flag = true; + }); +} + +void ut_time_start() +{ + UNI_THREAD_SAFE({ time_statistics_flag = true; }); +} + +void ut_time_stop() +{ + UNI_THREAD_SAFE({ time_statistics_flag = false; }); } inline std::string ut_profile_log(const std::string &name, @@ -84,6 +103,9 @@ void ut_time_process( #endif #endif + if (!time_statistics_flag) { + return; + } #ifdef _PROFILE_STATISTICS double duration = time_end_ms - time_start_ms; UNI_THREAD_SAFE({ @@ -99,6 +121,9 @@ void ut_time_process( void ut_time_statistics() { #ifndef _EAGER_LOG + printf("\nFunction Time:\n{\"name\": function name, \"cat\": function category, \"ph\": " + "function type, \"pid\": 
process id, \"tid\": thread id, \"ts\": start time(ms), " + "\"dur\": duration time(vs, gpu will have 1 ms synchronization overhead)\n"); for (unsigned int i = 0; i < logs.size(); i++) { UNI_PROFILE_LOG("%s\n", logs[i].c_str()); } diff --git a/compute/blas_enhance/src/CMakeLists.txt b/compute/blas_enhance/src/CMakeLists.txt index 00fb24a4..24fda937 100644 --- a/compute/blas_enhance/src/CMakeLists.txt +++ b/compute/blas_enhance/src/CMakeLists.txt @@ -29,6 +29,8 @@ if (USE_NEON) if (USE_INT8) file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) if (USE_FP16) + file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8.2/*.cpp) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8/*.cpp) else () file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v7/*.cpp) @@ -47,6 +49,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/blas_enhance/src/axpby.cpp b/compute/blas_enhance/src/axpby.cpp index 6f7cb448..6cc6fd02 100644 --- a/compute/blas_enhance/src/axpby.cpp +++ b/compute/blas_enhance/src/axpby.cpp @@ -18,6 +18,9 @@ #ifdef _USE_NEON #include "cpu/arm/blas_arm.h" #endif +#ifdef _USE_X86 +#include "cpu/x86/blas_x86.h" +#endif EE vector_vector_axpby( F32 a, TensorDesc xDesc, const void *x, F32 b, TensorDesc yDesc, void *y, Arch arch) @@ -45,8 +48,12 @@ EE vector_vector_axpby( ret = axpby_general(yLen, yDataType, a, x, b, y); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = axpby_arm(yLen, yDataType, a, x, b, y, arch); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = axpby_x86(yLen, yDataType, a, x, b, y); #endif } return ret; diff --git a/compute/blas_enhance/src/cpu/arm/axpby.cpp b/compute/blas_enhance/src/cpu/arm/axpby.cpp index 681ac07c..f13510bf 100644 --- a/compute/blas_enhance/src/cpu/arm/axpby.cpp +++ b/compute/blas_enhance/src/cpu/arm/axpby.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "error.h" - #include "cpu/arm/blas_arm.h" #ifdef _USE_FP16 #include "cpu/arm/fp16/blas_fp16.h" @@ -23,13 +21,10 @@ EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch arch) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (dt) { #ifdef _USE_FP16 case DT_F16: - if (ARM_A55 != arch && ARM_A76 != arch) { - return NOT_SUPPORTED; - } ret = axpby_fp16(len, a, (F16 *)x, b, (F16 *)y); break; #endif @@ -39,7 +34,6 @@ EE axpby_arm(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y, Arch ar break; #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h index 9b618c59..f879ba01 100644 --- a/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h +++ b/compute/blas_enhance/src/cpu/arm/fp16/mmm_common.h @@ -13,7 +13,7 @@ #ifndef _H_MMM_COMMON #define _H_MMM_COMMON -#include + #include #include "data_type.h" #include "uni.h" @@ -41,7 +41,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F16 *src, F16 *dst) { for (U32 i = 0; i < blockK; i++) { asm volatile("prfm pldl2keep, [%0, #48]\n" : "+r"(src) : : "memory", "cc"); - memcpy(dst, src, size * sizeof(F16)); + UNI_MEMCPY(dst, src, size * sizeof(F16)); dst += size; src += M; } diff --git a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp index 2b8af932..baedff5f 100644 --- a/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp16/mvm.cpp @@ -31,7 +31,7 @@ EE matrix_vector_multiply_transform_weight_fp16(TensorDesc desc, F16 *src, F16 * matrix1_trans(64, K, K, src + i * K, dst + i * K); } if (i < (int)N) { - memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); + UNI_MEMCPY(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F16)); } break; } diff --git a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp index a1761246..36e5e6f5 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/axpby.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "error.h" #include "cpu/arm/fp32/blas_fp32.h" EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) diff --git a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h index 4517bd72..2e5c5d21 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h +++ b/compute/blas_enhance/src/cpu/arm/fp32/blas_fp32.h @@ -66,7 +66,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) if (i % 16 == 0) { __builtin_prefetch(src + 16); } - memcpy(dst, src, size * sizeof(F32)); + UNI_MEMCPY(dst, src, size * sizeof(F32)); dst += size; src += M; } diff --git a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp index 0ce3b7ae..e695f424 100644 --- a/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/fp32/mvm.cpp @@ -28,7 +28,7 @@ EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 * matrix1_trans(16, K, K, src + i * K, dst + i * K); } if (i < (int)N) { - memcpy(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); + UNI_MEMCPY(dst + i * K, src + i * K, (N - i) * K * bytesOf(DT_F32)); } break; } diff --git a/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h b/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h index e711b16c..436bac54 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h +++ b/compute/blas_enhance/src/cpu/arm/int8/blas_matrix_transpose.h @@ -14,11 +14,10 @@ #ifndef _H_BLAS_MATRIX_TRANSPOSE #define _H_BLAS_MATRIX_TRANSPOSE -#include #include #include "data_type.h" -#ifndef __aarch64__ +#ifndef _USE_FP16 inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst) { INT8 *src1 = src; @@ -33,7 +32,7 @@ inline void matrix1_trans_int8(U32 size, U32 blockK, U32 K, INT8 *src, INT8 *dst } U32 K4 = pad_to_4_multiple(blockK); for (U32 i = 0; i < K4 - blockK; i++) { - memset(dst, 0, size * sizeof(INT8)); + UNI_MEMSET(dst, 0, size * sizeof(INT8)); dst += size; } } @@ -44,13 +43,13 @@ inline void matrix2_trans_int8(U32 size, U32 blockK, U32 M, INT8 *src, INT8 *dst if (i % 16 == 0) { __builtin_prefetch(src + 16); } - memcpy(dst, src, size * sizeof(INT8)); + UNI_MEMCPY(dst, src, size * sizeof(INT8)); dst += size; src += M; } U32 K4 = pad_to_4_multiple(blockK); for (U32 i = 0; i < K4 - blockK; i++) { - memset(dst, 0, size * sizeof(INT8)); + UNI_MEMSET(dst, 0, size * sizeof(INT8)); dst += size; } } @@ -67,19 +66,19 @@ inline void matrix1_trans_n8(U32 blockK, U32 K, INT8 *src, INT8 *dst) U32 k = 0; for (; k < blockK - 7; k += 8) { if (k % 64 == 0) { - asm volatile("prfm pldl2keep, [%[in0], 64]\n" - "prfm pldl2keep, [%[in1], 64]\n" - "prfm pldl2keep, [%[in2], 64]\n" - "prfm pldl2keep, [%[in3], 64]\n" - "prfm pldl2keep, [%[in4], 64]\n" - "prfm pldl2keep, [%[in5], 64]\n" - "prfm pldl2keep, [%[in6], 64]\n" - "prfm pldl2keep, [%[in7], 64]\n" - : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), - [in3] "+r"(in[3]), [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), - [in7] "+r"(in[7]) - : - : "memory", "cc"); + asm volatile( + "prfm pldl2keep, [%[in0], 64]\n" + "prfm pldl2keep, [%[in1], 64]\n" + "prfm pldl2keep, [%[in2], 64]\n" + "prfm pldl2keep, [%[in3], 64]\n" + "prfm pldl2keep, [%[in4], 64]\n" + "prfm pldl2keep, [%[in5], 64]\n" + "prfm pldl2keep, [%[in6], 64]\n" + "prfm pldl2keep, [%[in7], 64]\n" + : [in0] "+r"(in[0]), [in1] "+r"(in[1]), [in2] "+r"(in[2]), [in3] "+r"(in[3]), + [in4] "+r"(in[4]), [in5] "+r"(in[5]), [in6] "+r"(in[6]), [in7] 
"+r"(in[7]) + : + : "memory", "cc"); } asm volatile("ldr d0, [%[in0]], 8\n" "ldr d1, [%[in1]], 8\n" @@ -199,27 +198,27 @@ inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst) } src1 += offset; - asm volatile("ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" + asm volatile( + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), - [in3] "r"(in12[3]), [out] "r"(dst1) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); for (U32 j = 0; j < 4; j++) { for (U32 k = 0; k < 4; k++) { dst1[32 + j * 4 + k] = in12[k][8 + j]; @@ -241,27 +240,27 @@ inline void matrix2_trans_m12(U32 blockK, U32 M, INT8 *src, INT8 *dst) } } - asm volatile("ldr d0, [%[in0]]\n" - "ldr d1, [%[in1]]\n" - "ldr d2, [%[in2]]\n" - "ldr d3, [%[in3]]\n" - "zip1 v4.8b, v0.8b, v1.8b\n" - "zip2 v5.8b, v0.8b, v1.8b\n" - "zip1 v6.8b, v2.8b, v3.8b\n" - "zip2 v7.8b, v2.8b, v3.8b\n" + asm volatile( + "ldr d0, [%[in0]]\n" + "ldr d1, [%[in1]]\n" + "ldr d2, [%[in2]]\n" + "ldr d3, [%[in3]]\n" + "zip1 v4.8b, v0.8b, v1.8b\n" + "zip2 v5.8b, v0.8b, v1.8b\n" + "zip1 v6.8b, v2.8b, v3.8b\n" + "zip2 v7.8b, v2.8b, v3.8b\n" - "zip1 v0.4h, v4.4h, v6.4h\n" - "zip2 v1.4h, v4.4h, v6.4h\n" - "zip1 v2.4h, v5.4h, v7.4h\n" - "zip2 v3.4h, v5.4h, v7.4h\n" - "str d0, [%[out]]\n" - "str d1, [%[out], 8]\n" - "str d2, [%[out], 16]\n" - "str d3, [%[out], 24]\n" - : - : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), - [in3] "r"(in12[3]), [out] "r"(dst1) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); + "zip1 v0.4h, v4.4h, v6.4h\n" + "zip2 v1.4h, v4.4h, v6.4h\n" + "zip1 v2.4h, v5.4h, v7.4h\n" + "zip2 v3.4h, v5.4h, v7.4h\n" + "str d0, [%[out]]\n" + "str d1, [%[out], 8]\n" + "str d2, [%[out], 16]\n" + "str d3, [%[out], 24]\n" + : + : [in0] "r"(in12[0]), [in1] "r"(in12[1]), [in2] "r"(in12[2]), [in3] "r"(in12[3]), [out] "r"(dst1) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); for (U32 j = 0; j < 4; j++) { for (U32 k = 0; k < 4; k++) { dst1[32 + j * 4 + k] = in12[k][8 + j]; diff --git a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp index 637a0007..a57e532c 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/mvm.cpp @@ -15,6 +15,7 @@ #include "cpu/arm/blas_arm.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "arm_neon_expand.h" +#include "uni.h" #define ALIGN 32 @@ -28,7 +29,7 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 switch (desc.df) { 
case DF_NORMAL: { CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); -#ifdef __aarch64__ +#ifdef _USE_FP16 U32 K4 = pad_to_4_multiple(K); #else U32 K4 = K; @@ -37,13 +38,13 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 matrix1_trans_int8(ALIGN, K, K, src + i * K, dst + i * K4); } if (i < (int)N) { - memcpy(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8)); + UNI_MEMCPY(dst + i * K4, src + i * K, (N - i) * K * bytesOf(DT_I8)); } break; } case DF_TRANSPOSE: { CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); -#ifdef __aarch64__ +#ifdef _USE_FP16 U32 K4 = pad_to_4_multiple(K); #else U32 K4 = K; @@ -69,7 +70,7 @@ EE matrix_vector_multiply_transform_weight_int8(TensorDesc desc, INT8 *src, INT8 return ret; } -#ifndef __aarch64__ +#ifndef _USE_FP16 #if 1 void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) { @@ -197,7 +198,7 @@ void mvm_row_pack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *result) { U32 N = Nbatch * 8; -#ifdef __aarch64__ +#ifdef _USE_FP16 int8x16_t mat[8]; #else int16x4_t mat[8][2]; @@ -213,7 +214,7 @@ inline void mvm_row_unpack(U32 Nbatch, U32 K, INT8 *matrix, INT8 *vector, I32 *r int32x4_t bias0 = vld1q_s32(result + n); int32x4_t bias1 = vld1q_s32(result + n + 4); int32x4_t res[8] = {0}; -#ifdef __aarch64__ +#ifdef _USE_FP16 for (U32 k = 0; k < K_inner; k += 16) { int8x16_t v = vld1q_s8(vector + k); for (int i = 0; i < 8; i++) { @@ -319,7 +320,7 @@ inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 U32 NInner = N - NTail; for (U32 n = 0; n < NInner; n += 64) { - memset(tmp, 0, sizeof(I32) * 64); + UNI_MEMSET(tmp, 0, sizeof(I32) * 64); for (U32 k = 0; k < K; k++) { for (U32 i = 0; i < 64; i++) { tmp[i] += vector[k] * matrix[k * N + n + i]; @@ -331,7 +332,7 @@ inline void mvm_col(U32 numRows, U32 numColumns, INT8 *matrix, INT8 *vector, I32 } } - memset(tmp, 0, sizeof(I32) * 64); + UNI_MEMSET(tmp, 0, sizeof(I32) * 64); for (U32 k = 0; k < K; k++) { for (U32 i = 0; i < NTail; i++) { tmp[i] += vector[k] * matrix[k * N + NInner + i]; diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp new file mode 100644 index 00000000..52b9f1f6 --- /dev/null +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
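The int8 packing in these hunks rounds the reduction dimension up to a multiple of four (K4 = pad_to_4_multiple(K)) and zero-fills the tail, because the int8 kernels consume four int8 products per accumulation step; each packed panel is then addressed with an i * K4 stride, as in the transform routines above and below. A small sketch of the padding arithmetic, assuming pad_to_4_multiple is the usual round-up helper (its real definition lives elsewhere in the library):

// Illustrative only: round K up to the next multiple of 4 for int8 panels.
static inline unsigned int pad_to_4_multiple_sketch(unsigned int k)
{
    return (k + 3) / 4 * 4;  // same pattern as UNI_ALIGN(k, 4)
}
// Packed panel i of an N x K matrix then starts at dst + i * K4, with the
// last (K4 - K) entries of every column zero-filled by the transpose helpers.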
+ +#include "cpu/arm/int8/v8.2/mmm_v8.h" +#include "cpu/arm/int8/blas_int8.h" +#include "cpu/arm/int8/blas_matrix_transpose.h" +#include "cpu/arm/blas_arm.h" + +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ + EE ret = SUCCESS; + switch (arch) { + case ARM_A55: + mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + case ARM_A76: + mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix2_trans_m12(K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix2_trans_int8(8, K, N, src + i, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix2_trans_int8(4, K, N, src + i, dst + i * K4); + } + if ((int)N > i) { + matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); + } + return SUCCESS; +} + +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst) +{ + DataType dt; + DataFormat df; + U32 N, K; + CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); + U32 K4 = pad_to_4_multiple(K); + int i = 0; + for (; i < (int)N - 11; i += 12) { + matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 7; i += 8) { + matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); + } + for (; i < (int)N - 3; i += 4) { + matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); + } + if ((int)N > i) { + matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp similarity index 99% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp index 4db08cd3..a4086415 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A55.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A55.cpp @@ -12,8 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/v8/mmm_common.h" -#include "cpu/arm/int8/v8/mmm_v8.h" +#include "cpu/arm/int8/v8.2/mmm_common.h" +#include "cpu/arm/int8/v8.2/mmm_v8.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "uni.h" diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp similarity index 99% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp index 0495fd81..0ff95dfe 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_A76.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_A76.cpp @@ -12,8 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/arm/blas_arm.h" -#include "cpu/arm/int8/v8/mmm_common.h" -#include "cpu/arm/int8/v8/mmm_v8.h" +#include "cpu/arm/int8/v8.2/mmm_common.h" +#include "cpu/arm/int8/v8.2/mmm_v8.h" #include "cpu/arm/int8/blas_matrix_transpose.h" #include "uni.h" diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_common.h b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_common.h similarity index 100% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_common.h rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_common.h diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm_v8.h b/compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_v8.h similarity index 100% rename from compute/blas_enhance/src/cpu/arm/int8/v8/mmm_v8.h rename to compute/blas_enhance/src/cpu/arm/int8/v8.2/mmm_v8.h diff --git a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp b/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp index 8b1e6640..035d0a30 100644 --- a/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp +++ b/compute/blas_enhance/src/cpu/arm/int8/v8/mmm.cpp @@ -11,29 +11,13 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "cpu/arm/int8/v8/mmm_v8.h" #include "cpu/arm/int8/blas_int8.h" -#include "cpu/arm/int8/blas_matrix_transpose.h" #include "cpu/arm/blas_arm.h" +#include "cpu/arm/int8/blas_matrix_transpose.h" +#include "uni.h" +#include "thread_affinity.h" -EE mmm_int8( - int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) -{ - EE ret = SUCCESS; - switch (arch) { - case ARM_A55: - mmm_A55(M, N, K, transposeA, matrix1, matrix2, tmp, result); - break; - case ARM_A76: - mmm_A76(M, N, K, transposeA, matrix1, matrix2, tmp, result); - break; - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - +static const int tileN = 8; EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst) { DataType dt; @@ -42,15 +26,9 @@ EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 * CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); U32 K4 = pad_to_4_multiple(K); int i = 0; - for (; i < (int)N - 11; i += 12) { - matrix2_trans_m12(K, N, src + i, dst + i * K4); - } for (; i < (int)N - 7; i += 8) { matrix2_trans_int8(8, K, N, src + i, dst + i * K4); } - for (; i < (int)N - 3; i += 4) { - matrix2_trans_int8(4, K, N, src + i, dst + i * K4); - } if ((int)N > i) { matrix2_trans_int8(N - i, K, N, src + i, dst + i * K4); } @@ -65,17 +43,354 @@ EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 * CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); U32 K4 = pad_to_4_multiple(K); int i = 0; - for (; i < (int)N - 11; i += 12) { - matrix1_trans_int8(12, K, K, src + i * K, dst + i * K4); - } for (; i < (int)N - 7; i += 8) { matrix1_trans_int8(8, K, K, src + i * K, dst + i * K4); } - for (; i < (int)N - 3; i += 4) { - matrix1_trans_int8(4, K, K, src + i * K, dst + i * K4); - } if ((int)N > i) { matrix1_trans_int8(N - i, K, K, src + i * K, dst + i * K4); } return SUCCESS; } + +void mmm_4x8(U32 offset, U32 K, INT8 *in, INT8 *w, I32 *out) +{ +#if 1 + int32x4_t ret[tileN][2]; + for (int i = 0; i < tileN; i++) { + for (int j = 0; j < 2; j++) { + ret[i][j] = vld1q_s32(out + i * offset + j * 4); + } + } + int16x8_t c[tileN]; + for (U32 n = 0; n < K; n += 4) { + int8x8_t b0 = vld1_s8(w); + w += 8; + for (int i = 0; i < 
tileN; i++) { + int8x8_t a0 = vdup_n_s8(in[0]); + c[i] = vmull_s8(a0, b0); + in++; + } + for (U32 j = 0; j < 3; j++) { + int8x8_t b0 = vld1_s8(w); + w += 8; + for (int i = 0; i < tileN; i++) { + int8x8_t a0 = vdup_n_s8(in[0]); + c[i] = vmlal_s8(c[i], a0, b0); + in++; + } + } + for (int i = 0; i < tileN; i++) { + ret[i][0] = vaddw_s16(ret[i][0], vget_low_s16(c[i])); + ret[i][1] = vaddw_s16(ret[i][1], vget_high_s16(c[i])); + } + } + for (int i = 0; i < tileN; i++) { + for (int j = 0; j < 2; j++) { + vst1q_s32(out + i * offset + j * 4, ret[i][j]); + } + } +#else + offset *= 4; + asm volatile("mov x3, %0\n" + "ld1r {v0.8b}, [x3]\n" + "ld1r {v1.8b}, [x3]!\n" + "ld1r {v2.8b}, [x3]!\n" + "ld1r {v3.8b}, [x3]!\n" + //"ld1r {v4.8b}, [x3]!\n" + //"ld1r {v5.8b}, [x3]!\n" + + "mov x0, %1\n" + "ldp d6, d7, [x0]!\n" + + // give out address to x26 + "mov x26, %2\n" + + // load in bias + "ldp q8, q9, [x26]\n" + "add x26, x26, %4\n" + "ldp q10, q11, [x26]\n" + "add x26, x26, %4\n" + "ldp q12, q13, [x26]\n" + "add x26, x26, %4\n" + "ldp q14, q15, [x26]\n" + "add x26, x26, %4\n" + "ldp q24, q25, [x26]\n" + "add x26, x26, %4\n" + "ldp q26, q27, [x26]\n" + "add x26, x26, %4\n" + "ldp q28, q29, [x26]\n" + "add x26, x26, %4\n" + "ldp q30, q31, [x26]\n" + + // K- > x26 + "mov x26, %3\n" + + // Computation loop + "0:\n" + + "smull v16.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smull v17.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smull v18.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smull v19.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smull v20.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smull v21.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smull v22.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smull v23.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d6, [x0]!\n" + + "smlal v16.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d7, [x0]!\n" + + "smlal v16.8h, v0.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v4.8b, v6.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v6.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v6.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v6.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d6, [x0]!\n" + + "smlal v16.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v17.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v18.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v19.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "smlal v20.8h, v0.8b, v7.8b\n" + "ld1r {v0.8b}, [x3]!\n" + "smlal v21.8h, v1.8b, v7.8b\n" + "ld1r {v1.8b}, [x3]!\n" + "smlal v22.8h, v2.8b, v7.8b\n" + "ld1r {v2.8b}, [x3]!\n" + "smlal v23.8h, v3.8b, v7.8b\n" + "ld1r {v3.8b}, [x3]!\n" + "ldr d7, [x0]!\n" + + "subs x26, x26, #4\n" + + "saddw v8.4s, v8.4s, v16.4h\n" + "saddw2 v9.4s, v9.4s, v16.8h\n" + "saddw v10.4s, v10.4s, v17.4h\n" + "saddw2 v11.4s, v11.4s, v17.8h\n" + "saddw v12.4s, v12.4s, v18.4h\n" + "saddw2 v13.4s, v13.4s, v18.8h\n" + "saddw v14.4s, v14.4s, 
v19.4h\n" + "saddw2 v15.4s, v15.4s, v19.8h\n" + "saddw v24.4s, v24.4s, v20.4h\n" + "saddw2 v25.4s, v25.4s, v20.8h\n" + "saddw v26.4s, v26.4s, v21.4h\n" + "saddw2 v27.4s, v27.4s, v21.8h\n" + "saddw v28.4s, v28.4s, v22.4h\n" + "saddw2 v29.4s, v29.4s, v22.8h\n" + "saddw v30.4s, v30.4s, v23.4h\n" + "saddw2 v31.4s, v31.4s, v23.8h\n" + + "bne 0b\n" + + // give out address to x26 + "mov x26, %2\n" + + "stp q8, q9, [x26]\n" + "add x26, x26, %4\n" + "stp q10, q11, [x26]\n" + "add x26, x26, %4\n" + "stp q12, q13, [x26]\n" + "add x26, x26, %4\n" + "stp q14, q15, [x26]\n" + "add x26, x26, %4\n" + "stp q24, q25, [x26]\n" + "add x26, x26, %4\n" + "stp q26, q27, [x26]\n" + "add x26, x26, %4\n" + "stp q28, q29, [x26]\n" + "add x26, x26, %4\n" + "stp q30, q31, [x26]\n" + : "+r"(in), "+r"(w), "+r"(out) + : "r"((I64)K), "r"((I64)offset) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", + "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", + "v30", "v31", "x26"); +#endif +} + +inline void mmm_NTail_M8(U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + for (U32 i = 0; i < N; i++) { + int32x4_t res1 = vld1q_s32(result + i * M); + int32x4_t res2 = vld1q_s32(result + i * M + 4); + for (U32 q = 0; q < K; q += 1) { + int8x8_t mat2 = vld1_s8(matrix2 + q * 8); + int8x8_t mat1 = vdup_n_s8(matrix1[q * N + i]); + int16x8_t r = vmull_s8(mat1, mat2); + res1 = vaddw_s16(res1, vget_low_s16(r)); + res2 = vaddw_s16(res2, vget_high_s16(r)); + } + vst1q_s32(result + i * M, res1); + vst1q_s32(result + i * M + 4, res2); + } +} + +inline void mmm_NTail_M(U32 MInner, U32 M, U32 N, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + for (U32 i = 0; i < N; i++) { + for (U32 j = 0; j < MInner; j++) { + for (U32 k = 0; k < K; k++) { + result[i * M + j] += ((I32)matrix1[k * N + i]) * matrix2[k * MInner + j]; + } + } + } +} + +inline void mmm_N4_MTail(U32 MInner, U32 M, U32 K, INT8 *matrix1, INT8 *matrix2, I32 *result) +{ + const int unroll = 4; + int32x4_t res[tileN][2] = {0}; + for (U32 k = 0; k < K; k += unroll) { + int16x8_t res_s16[tileN] = {0}; + for (U32 kk = 0; kk < unroll; kk++) { + U32 z = k + kk; + int8x8_t mat2 = vld1_s8(matrix2 + z * MInner); + for (int i = 0; i < tileN; i++) { + int8x8_t mat10 = vdup_n_s8(matrix1[z * tileN + i]); + res_s16[i] = vmlal_s8(res_s16[i], mat10, mat2); + } + } + for (int i = 0; i < tileN; i++) { + res[i][0] = vaddw_s16(res[i][0], vget_low_s16(res_s16[i])); + res[i][1] = vaddw_s16(res[i][1], vget_high_s16(res_s16[i])); + } + } + int tmp[8]; + for (int i = 0; i < tileN; i++) { + vst1q_s32(tmp, res[i][0]); + vst1q_s32(tmp + 4, res[i][1]); + for (U32 p = 0; p < MInner; p++) { + result[i * M + p] += tmp[p]; + } + } +} + +EE mmm_int8( + int M, int N, int K, bool transposeA, INT8 *matrix1, INT8 *matrix2, INT8 *tmp, I32 *result, Arch arch) +{ + int blockK = K; + U32 K4 = pad_to_4_multiple(K); + int blockM = 96; + for (int k = 0; k < K; k += blockK) { + int KInner = UNI_MIN(blockK, K - k); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int n = 0; n <= N - tileN; n += tileN) { + INT8 *matrix1Trans = tmp + n * K4; + if (transposeA) { + matrix2_trans_int8(tileN, KInner, N, matrix1 + n, matrix1Trans); + } else { + matrix1_trans_int8(tileN, KInner, K, matrix1 + n * K + k, matrix1Trans); + } + } + int n = N / tileN * tileN; + if (N - n > 0) { + INT8 *matrix1Trans = tmp + n * K4; + if (transposeA) { + matrix2_trans_int8(N - n, 
KInner, N, matrix1 + n, matrix1Trans); + } else { + matrix1_trans_int8(N - n, KInner, K, matrix1 + n * K + k, matrix1Trans); + } + } + +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < M; i += blockM) { + int MInner = UNI_MIN(blockM, M - i); + I32 *resultCurrent; + int m, n; + for (n = 0; n <= N - tileN; n += tileN) { + INT8 *matrix1Trans = tmp + n * K4; + //if (i == 0) { + // if (transposeA) { + // matrix2_trans_int8(4, KInner, N, matrix1 + n, matrix1Trans + n * K4); + // } else { + // matrix1_trans_int8(4, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + // } + //} + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_4x8(M, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(8, M, 4, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_N4_MTail( + MInner - m, M, K4, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(MInner - m, M, 4, K4, matrix1Trans, matrix2 + (i + m) * K4, + // resultCurrent); + } + } + + if (N - n) { + INT8 *matrix1Trans = tmp + n * K4; + //if (i == 0) { + // if (transposeA) { + // matrix2_trans_int8(N - n, KInner, N, matrix1 + n, matrix1Trans + n * K4); + // } else { + // matrix1_trans_int8( + // N - n, KInner, K, matrix1 + n * K + k, matrix1Trans + n * K4); + // } + //} + + for (m = 0; m <= (MInner - 8); m += 8) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M8( + M, N - n, KInner, matrix1Trans, matrix2 + (i + m) * K4, resultCurrent); + //mmm_NTail_M(8, M, N - n, K4, matrix1Trans, + // matrix2 + (i + m) * K4, resultCurrent); + } + + if (MInner - m) { + resultCurrent = result + n * M + m + i; + mmm_NTail_M(MInner - m, M, N - n, K4, matrix1Trans, matrix2 + (i + m) * K4, + resultCurrent); + } + } + } + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/axpby.cpp b/compute/blas_enhance/src/cpu/x86/axpby.cpp new file mode 100644 index 00000000..b105d41a --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/axpby.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/blas_x86.h" +#ifdef _USE_FP32 +#include "cpu/x86/fp32/blas_fp32.h" +#endif + +EE axpby_x86(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y) +{ + EE ret = NOT_SUPPORTED; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = axpby_fp32(len, a, (F32 *)x, b, (F32 *)y); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/blas_enhance/src/cpu/x86/blas_x86.h b/compute/blas_enhance/src/cpu/x86/blas_x86.h index 997b21fe..c667c99b 100644 --- a/compute/blas_enhance/src/cpu/x86/blas_x86.h +++ b/compute/blas_enhance/src/cpu/x86/blas_x86.h @@ -15,9 +15,10 @@ #define _H_BLAS_X86 #include "error.h" -#include "sys.h" #include "tensor_desc.h" +EE axpby_x86(U32 len, DataType dt, F32 a, const void *x, F32 b, void *y); + EE matrix_vector_multiply_transform_weight_x86( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias); @@ -34,10 +35,10 @@ EE mvm_x86(U32 row, const F32 *scale); EE matrix_matrix_multiply_tmp_bytes_x86( - U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes); + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataFormat df, DataType dt, U32 *bytes); EE matrix_matrix_multiply_transform_rhs_x86( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias); + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst); EE mmm_x86(U32 matrixC_N, U32 matrixC_M, diff --git a/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp b/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp new file mode 100644 index 00000000..c96b33aa --- /dev/null +++ b/compute/blas_enhance/src/cpu/x86/fp32/axpby.cpp @@ -0,0 +1,32 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/blas_fp32.h" + +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y) +{ + __m256 alpha = _mm256_set1_ps(a); + __m256 beta = _mm256_set1_ps(b); + I32 i = 0; + for (; i < ((I32)len) - 7; i += 8) { + __m256 in = _mm256_loadu_ps(x + i); + __m256 out = _mm256_loadu_ps(y + i); + out = _mm256_mul_ps(out, beta); + out = _mm256_fmadd_ps(alpha, in, out); + _mm256_storeu_ps(y + i, out); + } + for (; i < (I32)len; i++) { + y[i] = a * x[i] + b * y[i]; + } + return SUCCESS; +} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h index 02d9ab5e..2cdaece0 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h +++ b/compute/blas_enhance/src/cpu/x86/fp32/blas_fp32.h @@ -14,13 +14,12 @@ #ifndef _H_BLAS_FP32 #define _H_BLAS_FP32 -#include "sys.h" - -#include "error.h" #include "tensor_desc.h" #include "thread_affinity.h" #include "uni.h" +EE axpby_fp32(U32 len, F32 a, const F32 *x, F32 b, F32 *y); + void mvm_col_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); void mvm_row_fp32(U32 row, U32 col, F32 *matrix, F32 *vector, F32 *result); @@ -68,14 +67,15 @@ EE mmm_avx2_fp32(int M, F32 *tmp, F32 *result); -inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +inline void matrix1_trans_w(U32 size, U32 realSize, U32 blockK, U32 K, F32 *src, F32 *dst) { - U32 remain = size % 4; - size = size / 4 * 4; + U32 remain = realSize % 4; + U32 mainSize = realSize / 4 * 4; __m128i vindex = _mm_set_epi32(K * 3, K * 2, K, 0); + F32 *rdst = dst; for (U32 i = 0; i < blockK; ++i) { U32 j; - for (j = 0; j < size; j += 4) { + for (j = 0; j < mainSize; j += 4) { if (i % 16 == 0) { _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); @@ -85,7 +85,49 @@ inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) _mm_store_ps(dst, _mm_i32gather_ps(src + i + j * K, vindex, 4)); dst += 4; } - for (; j < remain; ++j) { + for (; j < realSize; ++j) { + if (i % 16 == 0) { + _mm_prefetch(src + i + (j + mainSize) * K + 16, _MM_HINT_NTA); + } + *(dst++) = *(src + i + j * K); + } + + for (; j < size; ++j) { + *(dst++) = 0; + } + } +} + +inline void matrix2_trans_w(U32 size, U32 realSize, U32 blockK, U32 M, F32 *src, F32 *dst) +{ + for (U32 i = 0; i < blockK; i++) { + for (U32 j = 0; j < size; j += 16) { + _mm_prefetch(src + M + j, _MM_HINT_NTA); + } + UNI_MEMCPY(dst, src, realSize * sizeof(F32)); + dst += size; + src += M; + } +} + +inline void matrix1_trans(U32 size, U32 blockK, U32 K, F32 *src, F32 *dst) +{ + U32 remain = size % 8; + size = size / 8 * 8; + __m256i vindex = _mm256_set_epi32(K * 7, K * 6, K * 5, K * 4, K * 3, K * 2, K, 0); + for (U32 i = 0; i < blockK; ++i) { + U32 j; + for (j = 0; j < size; j += 8) { + if (i % 16 == 0) { + _mm_prefetch(src + i + j * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 1) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 2) * K + 16, _MM_HINT_NTA); + _mm_prefetch(src + i + (j + 3) * K + 16, _MM_HINT_NTA); + } + _mm256_storeu_ps(dst, _mm256_i32gather_ps(src + i + j * K, vindex, 4)); + dst += 8; + } + for (; j < (remain + size); ++j) { if (i % 16 == 0) { _mm_prefetch(src + i + (j + size) * K + 16, _MM_HINT_NTA); } @@ -100,7 +142,7 @@ inline void matrix2_trans(U32 size, U32 blockK, U32 M, F32 *src, F32 *dst) for (U32 j = 0; j < size; j += 16) { _mm_prefetch(src + M + j, _MM_HINT_NTA); } - memcpy(dst, src, size * sizeof(F32)); + UNI_MEMCPY(dst, src, size * sizeof(F32)); dst += size; src += M; 
} diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp index d1758191..6ad2c4e0 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mmm_avx2.cpp @@ -17,17 +17,576 @@ #define UNROLL_K 4 #define UNROLL_N 24 #define UNROLL_M 4 -#define BOLCK_M_DIM 768 -#define BOLCK_K_DIM 768 +#define BOLCK_M_DIM 1024 +#define BOLCK_K_DIM 1024 #define align_addr(addr, unit) (((uintptr_t)addr + unit - 1) / unit * unit) -typedef void (*kernel_func)( - U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 ldc); +typedef void (*kernel_func)(U32 um, + U32 un, + U32 bk, + F32 *matrixA, + F32 *matrixB, + F32 *matrixC, + U32 ldc, + I32 *mask, + F32 *A1, + F32 *A2, + F32 *A3); + +// clang-format off +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear4Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" + +#define clear6Regs(rtype) \ + clear4Regs(rtype) \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" + +#define clear8Regs(rtype) \ + clear6Regs(rtype) \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" + +#define clear9Regs(rtype) \ + clear8Regs(rtype) \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" + +#define clear12Regs(rtype) \ + clear9Regs(rtype) \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define asm_1x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A0]), %%ymm15 \n\t" \ + "vmovaps "#f0"(%[B]), %%ymm12 \n\t" \ + "vmovaps "#f1"(%[B]), %%ymm13 \n\t" \ + "vmovaps "#f2"(%[B]), %%ymm14 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + +#define asm_2x24_kernel(i0, f0, f1, f2) \ + asm_1x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A1]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + +#define asm_3x24_kernel(i0, f0, f1, f2) \ + asm_2x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A2]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + +#define asm_4x24_kernel(i0, f0, f1, f2) \ + asm_3x24_kernel(i0, f0, f1, f2) \ + "vbroadcastss "#i0"(%[A3]), %%ymm15 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" \ + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + +#define store_1x24_0(N) \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vaddps 0x40(%[C]), %%ymm2, %%ymm2 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + "vmovups %%ymm2, 0x40(%[C]) \n\t" + +#define store_2x24_0(N) \ + store_1x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm3, %%ymm3 \n\t" \ + "vaddps 0x20(%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps 0x40(%[C]), %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm3, (%[C]) \n\t" \ + "vmovups %%ymm4, 0x20(%[C]) \n\t" \ + "vmovups %%ymm5, 0x40(%[C]) \n\t" + 
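Each asm_*x24_kernel step above broadcasts one element of A, loads three aligned 8-float strips of packed B, and accumulates into the per-row ymm registers with vfmadd231ps; the store_* macros then add those accumulators into C, the *_1 variants using vmaskmovps for a partial final strip. The same single accumulation step written with AVX2 intrinsics, purely to illustrate what the assembly encodes (function and variable names are made up):

#include <immintrin.h>

// One k-step of a 1x24 tile: c[0..2] += a0[0] * b[0..23].
static inline void tile_1x24_step(const float *a0, const float *b, __m256 c[3])
{
    __m256 av = _mm256_broadcast_ss(a0);                     // vbroadcastss
    c[0] = _mm256_fmadd_ps(av, _mm256_load_ps(b), c[0]);     // vfmadd231ps
    c[1] = _mm256_fmadd_ps(av, _mm256_load_ps(b + 8), c[1]);
    c[2] = _mm256_fmadd_ps(av, _mm256_load_ps(b + 16), c[2]);
}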
+#define store_3x24_0(N) \ + store_2x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vaddps 0x40(%[C]), %%ymm8, %%ymm8 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + "vmovups %%ymm8, 0x40(%[C]) \n\t" + +#define store_4x24_0(N) \ + store_3x24_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm9, %%ymm9 \n\t" \ + "vaddps 0x20(%[C]), %%ymm10, %%ymm10 \n\t" \ + "vaddps 0x40(%[C]), %%ymm11, %%ymm11 \n\t" \ + "vmovups %%ymm9, (%[C]) \n\t" \ + "vmovups %%ymm10, 0x20(%[C]) \n\t" \ + "vmovups %%ymm11, 0x40(%[C]) \n\t" + +#define store_1x24_1(N) \ + "vmovups (%[mask]), %%ymm15 \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm14, %%ymm2, %%ymm2 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm2, %%ymm15, 0x40(%[C]) \n\t" + +#define store_2x24_1(N) \ + store_1x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm3, %%ymm3 \n\t" \ + "vaddps 0x20(%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm3, (%[C]) \n\t" \ + "vmovups %%ymm4, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm5, %%ymm15, 0x40(%[C]) \n\t" + +#define store_3x24_1(N) \ + store_2x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vaddps %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm8, %%ymm15, 0x40(%[C]) \n\t" + +#define store_4x24_1(N) \ + store_3x24_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x40(%[C]), %%ymm15, %%ymm14 \n\t" \ + "vaddps (%[C]), %%ymm9, %%ymm9 \n\t" \ + "vaddps 0x20(%[C]), %%ymm10, %%ymm10 \n\t" \ + "vaddps %%ymm14, %%ymm11, %%ymm11 \n\t" \ + "vmovups %%ymm9, (%[C]) \n\t" \ + "vmovups %%ymm10, 0x20(%[C]) \n\t" \ + "vmaskmovps %%ymm11, %%ymm15, 0x40(%[C]) \n\t" + + +#define asm_1x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A0]), %%ymm10 \n\t" \ + "vmovaps "#f0"(%[B]), %%ymm8 \n\t" \ + "vmovaps "#f1"(%[B]), %%ymm9 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" \ + +#define asm_2x16_kernel(i0, f0, f1) \ + asm_1x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A1]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" \ + +#define asm_3x16_kernel(i0, f0, f1) \ + asm_2x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A2]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" \ + +#define asm_4x16_kernel(i0, f0, f1) \ + asm_3x16_kernel(i0, f0, f1) \ + "vbroadcastss "#i0"(%[A3]), %%ymm10 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" \ + "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" \ + +#define store_1x16_0(N) \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps 0x20(%[C]), %%ymm1, %%ymm1 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmovups %%ymm1, 0x20(%[C]) \n\t" \ + +#define store_2x16_0(N) \ + store_1x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm2, %%ymm2 \n\t" \ + "vaddps 0x20(%[C]), %%ymm3, %%ymm3 \n\t" \ + "vmovups %%ymm2, (%[C]) \n\t" \ + "vmovups %%ymm3, 0x20(%[C]) \n\t" \ + +#define store_3x16_0(N) \ + store_2x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm4, %%ymm4 
\n\t" \ + "vaddps 0x20(%[C]), %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm4, (%[C]) \n\t" \ + "vmovups %%ymm5, 0x20(%[C]) \n\t" \ + +#define store_4x16_0(N) \ + store_3x16_0(N) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps 0x20(%[C]), %%ymm7, %%ymm7 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmovups %%ymm7, 0x20(%[C]) \n\t" \ + +#define store_1x16_1(N) \ + "vmovups (%[mask]), %%ymm10 \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm9, %%ymm1, %%ymm1 \n\t" \ + "vmovups %%ymm0, (%[C]) \n\t" \ + "vmaskmovps %%ymm1, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_2x16_1(N) \ + store_1x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm2, %%ymm2 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vmovups %%ymm2, (%[C]) \n\t" \ + "vmaskmovps %%ymm3, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_3x16_1(N) \ + store_2x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm9, %%ymm5, %%ymm5 \n\t" \ + "vmovups %%ymm4, (%[C]) \n\t" \ + "vmaskmovps %%ymm5, %%ymm10, 0x20(%[C]) \n\t" \ + +#define store_4x16_1(N) \ + store_3x16_1(N) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps 0x20(%[C]), %%ymm10, %%ymm9 \n\t" \ + "vaddps (%[C]), %%ymm6, %%ymm6 \n\t" \ + "vaddps %%ymm9, %%ymm7, %%ymm7 \n\t" \ + "vmovups %%ymm6, (%[C]) \n\t" \ + "vmaskmovps %%ymm7, %%ymm10, 0x20(%[C]) \n\t" \ + + +#define asm_1x8_kernel(i0, f0, rtype) \ + "vmovaps "#f0"(%[B]), "#rtype"4 \n\t" \ + "vbroadcastss "#i0"(%[A0]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"0 \n\t" + +#define asm_2x8_kernel(i0, f0, rtype) \ + asm_1x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A1]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"1 \n\t" + +#define asm_3x8_kernel(i0, f0, rtype) \ + asm_2x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A2]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"2 \n\t" + +#define asm_4x8_kernel(i0, f0, rtype) \ + asm_3x8_kernel(i0, f0, rtype) \ + "vbroadcastss "#i0"(%[A3]), "#rtype"5 \n\t" \ + "vfmadd231ps "#rtype"5, "#rtype"4, "#rtype"3 \n\t" + +#define store_1x8_0(N, rtype) \ + "vaddps (%[C]), "#rtype"0, "#rtype"0 \n\t" \ + "vmovups "#rtype"0, (%[C]) \n\t" + +#define store_2x8_0(N, rtype) \ + store_1x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"1, "#rtype"1 \n\t" \ + "vmovups "#rtype"1, (%[C]) \n\t" + +#define store_3x8_0(N, rtype) \ + store_2x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"2, "#rtype"2 \n\t" \ + "vmovups "#rtype"2, (%[C]) \n\t" + +#define store_4x8_0(N, rtype) \ + store_3x8_0(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vaddps (%[C]), "#rtype"3, "#rtype"3 \n\t" \ + "vmovups "#rtype"3, (%[C]) \n\t" + +#define store_1x8_1(N, rtype) \ + "vmovups (%[mask]), "#rtype"5 \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"0, "#rtype"0 \n\t" \ + "vmaskmovps "#rtype"0, "#rtype"5, (%[C]) \n\t" + +#define store_2x8_1(N, rtype) \ + store_1x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"1, "#rtype"1 \n\t" \ + "vmaskmovps "#rtype"1, "#rtype"5, (%[C]) \n\t" + +#define store_3x8_1(N, rtype) \ + store_2x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"2, "#rtype"2 \n\t" \ + "vmaskmovps "#rtype"2, "#rtype"5, (%[C]) 
\n\t" + +#define store_4x8_1(N, rtype) \ + store_3x8_1(N, rtype) \ + "add "#N", %[C] \n\t" \ + "vmaskmovps (%[C]), "#rtype"5, "#rtype"4 \n\t" \ + "vaddps "#rtype"4, "#rtype"3, "#rtype"3 \n\t" \ + "vmaskmovps "#rtype"3, "#rtype"5, (%[C]) \n\t" + +#define kernel_24_4_loop(m) \ + "prefetcht0 0x140(%[B]) \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + asm_##m##x24_kernel(0x0, 0x0, 0x20, 0x40) \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + asm_##m##x24_kernel(0x4, 0x60, 0x80, 0xA0) \ + "prefetcht0 0x200(%[B]) \n\t" \ + "prefetcht0 0x240(%[B]) \n\t" \ + asm_##m##x24_kernel(0x8, 0xC0, 0xE0, 0x100) \ + "prefetcht0 0x280(%[B]) \n\t" \ + asm_##m##x24_kernel(0xC, 0x120, 0x140, 0x160) \ + "add $0x180, %[B] \n\t" + +#define kernel_16_4_loop(m) \ + "prefetcht0 0x140(%1) \n\t" \ + asm_##m##x16_kernel(0x0, 0x0, 0x20) \ + "prefetcht0 0x180(%1) \n\t" \ + asm_##m##x16_kernel(0x4, 0x40, 0x60) \ + "prefetcht0 0x1C0(%1) \n\t" \ + asm_##m##x16_kernel(0x8, 0x80, 0xA0) \ + "prefetcht0 0x200(%1) \n\t" \ + asm_##m##x16_kernel(0xC, 0xC0, 0xE0) \ + "add $0x100, %[B] \n\t" + +#define kernel_8_4_loop(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%ymm) \ + asm_##m##x8_kernel(0x4, 0x20, %%ymm) \ + asm_##m##x8_kernel(0x8, 0x40, %%ymm) \ + asm_##m##x8_kernel(0xC, 0x60, %%ymm) \ + "add $0x80, %[B] \n\t" + +#define kernel_4_4_loop(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%xmm) \ + asm_##m##x8_kernel(0x4, 0x10, %%xmm) \ + asm_##m##x8_kernel(0x8, 0x20, %%xmm) \ + asm_##m##x8_kernel(0xC, 0x30, %%xmm) \ + "add $0x40, %[B] \n\t" + +#define m_24_kernel(m, x, edge) \ + __asm__ __volatile__(clear##x##Regs(%%ymm) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_24_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##m##x24_kernel(0x0, 0x0, 0x20, 0x40) \ + "add $0x60, %[B] \n\t" \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x24_##edge(%%rax) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" (A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", \ + "%ymm10", "%ymm11", "%ymm12", "%ymm13", \ + "%ymm14", "%ymm15", "memory"); + +#define m_16_kernel(m, x, edge) \ + __asm__ __volatile__(clear##x##Regs(%%ymm) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_16_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##m##x16_kernel(0x0, 0x0, 0x20) \ + "add $0x40, %[B] \n\t" \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x16_##edge(%%rax) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" 
(A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", \ + "%ymm10", "memory"); + +#define asm_4_kernel(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%xmm) \ + "add $0x10, %[B] \n\t" + +#define asm_8_kernel(m) \ + asm_##m##x8_kernel(0x0, 0x0, %%ymm) \ + "add $0x20, %[B] \n\t" + +#define m_8_kernel_wrap(m, n, x, rtype, edge) \ + __asm__ __volatile__(clear##x##Regs(rtype) \ + "mov %[bk], %%ecx \n\t" \ + "shr $2, %%ecx \n\t" \ + "je 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + kernel_##n##_4_loop(m) \ + "add $0x10, %[A0] \n\t" \ + "add $0x10, %[A1] \n\t" \ + "add $0x10, %[A2] \n\t" \ + "add $0x10, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 0b \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[bk], %%ecx \n\t" \ + "and $3, %%ecx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + asm_##n##_kernel(m) \ + "add $0x4, %[A0] \n\t" \ + "add $0x4, %[A1] \n\t" \ + "add $0x4, %[A2] \n\t" \ + "add $0x4, %[A3] \n\t" \ + "dec %%ecx \n\t" \ + "jg 2b \n\t" \ + "3: \n\t" \ + "shl $2, %%rax \n\t" \ + store_##m##x8_##edge(%%rax, rtype) \ + : [B] "+r" (matrixB), \ + [A0] "+r" (matrixA), \ + [A1] "+r" (A1), \ + [A2] "+r" (A2), \ + [A3] "+r" (A3), \ + [C] "+r" (matrixC) \ + : "a"((I64)N), \ + [bk] "r" (bk), \ + [mask] "r" (mask) \ + : "%ecx", \ + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", \ + "%ymm5", "memory"); + +#define m_8_kernel(m, x, edge) \ + m_8_kernel_wrap(m, 8, x, %%ymm, edge) + +#define m_4_kernel(m, x, edge) \ + m_8_kernel_wrap(m, 4, x, %%xmm, edge) + +#define mmm_mxn_asm(m, n, regNum) \ + void mmm_avx2_##m##x##n##_asm( \ + U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, \ + F32 *matrixC, U32 N, I32 *mask, F32 *A1, F32 *A2, F32 *A3) \ +{ \ + if (mask == nullptr) { \ + m_##n##_kernel(m, regNum, 0) \ + } else { \ + m_##n##_kernel(m, regNum, 1) \ + } \ +} + +mmm_mxn_asm(4, 24, 12) +mmm_mxn_asm(3, 24, 9) +mmm_mxn_asm(2, 24, 6) +mmm_mxn_asm(1, 24, 3) +mmm_mxn_asm(4, 16, 8) +mmm_mxn_asm(3, 16, 6) +mmm_mxn_asm(2, 16, 4) +mmm_mxn_asm(1, 16, 2) +mmm_mxn_asm(4, 8, 4) +mmm_mxn_asm(3, 8, 3) +mmm_mxn_asm(2, 8, 2) +mmm_mxn_asm(1, 8, 1) +mmm_mxn_asm(4, 4, 4) +mmm_mxn_asm(3, 4, 3) +mmm_mxn_asm(2, 4, 2) +mmm_mxn_asm(1, 4, 1) + +// clang-format on + +void mmm_avx2_n_mtail(U32 um, + U32 un, + U32 bk, + F32 *matrixA, + F32 *matrixB, + F32 *matrixC, + U32 N, + I32 *mask, + F32 *A1, + F32 *A2, + F32 *A3) +{ + F32 *ar[4] = {matrixA, A1, A2, A3}; + for (U32 i = 0; i < um; ++i) { + for (U32 j = 0; j < un; ++j) { + for (U32 k = 0; k < bk; ++k) { + matrixC[i * N + j] += ar[i][k] * matrixB[k * un + j]; + } + } + } +} void matrix_matrix_multiply_tmp_bytes_fp32( U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) { - *bytes = row1 * col1 + row2 * col2; + *bytes = row1 * col1 + (col2 + 7) / 8 * 8 * row2; *bytes *= sizeof(dt); *bytes += 32; } @@ -39,15 +598,18 @@ EE matrix_matrix_multiply_transform_rhsN_fp32(TensorDesc desc, F32 *src, F32 *ds U32 N, K, blockSizeK, unrollSizeN; CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); F32 unrollSize[4] = {4, 8, 16, 24}; + U32 resN = N % UNROLL_N; + U32 edgeBlockNSizeIdx = (resN > 4) ? 
((resN + 7) / 8) : 0; + U32 edgeBlockNSize = unrollSize[edgeBlockNSizeIdx]; // buffer addr algined to 32 F32 *packB = (F32 *)align_addr(dst, 32); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); for (U32 un = 0; un < N; un += unrollSizeN) { - unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN / 8], unrollSizeN); - matrix2_trans(unrollSizeN, blockSizeK, N, src + un, packB); + unrollSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - un), edgeBlockNSize); + matrix2_trans_w( + unrollSizeN, UNI_MIN(N - un, unrollSizeN), blockSizeK, N, src + un, packB); packB += unrollSizeN * blockSizeK; } src += blockSizeK * N; @@ -62,15 +624,18 @@ EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *ds U32 N, K, blockSizeK, unrollSizeN; CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); F32 unrollSize[4] = {4, 8, 16, 24}; + U32 resN = N % UNROLL_N; + U32 edgeBlockNSizeIdx = (resN > 4) ? ((resN + 7) / 8) : 0; + U32 edgeBlockNSize = unrollSize[edgeBlockNSizeIdx]; // buffer addr aligned to 32 F32 *packB = (F32 *)align_addr(dst, 32); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); for (U32 un = 0; un < N; un += unrollSizeN) { - unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 3], unrollSizeN); - matrix1_trans(unrollSizeN, blockSizeK, K, src + un * K, packB); + unrollSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - un), edgeBlockNSize); + matrix1_trans_w( + unrollSizeN, UNI_MIN(N - un, unrollSizeN), blockSizeK, K, src + un * K, packB); packB += unrollSizeN * blockSizeK; } src += blockSizeK; @@ -78,1367 +643,79 @@ EE matrix_matrix_multiply_transform_rhsT_fp32(TensorDesc desc, F32 *src, F32 *ds return SUCCESS; } -void mmm_avx2_4x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x24_end \n\t" - ".align 16 \n\t" - ".k_loop_4x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm12 \n\t" - "vmovaps 0x20(%1), %%ymm13 \n\t" - "vmovaps 0x40(%1), %%ymm14 \n\t" - "vbroadcastss 0x0(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0xC(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm12 \n\t" - "vmovaps 0x80(%1), %%ymm13 \n\t" - "vmovaps 0xA0(%1), %%ymm14 \n\t" - "vbroadcastss 0x10(%2), 
%%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x14(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x18(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x1C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm12 \n\t" - "vmovaps 0xE0(%1), %%ymm13 \n\t" - "vmovaps 0x100(%1), %%ymm14 \n\t" - "vbroadcastss 0x20(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x24(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x28(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x2C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm12 \n\t" - "vmovaps 0x140(%1), %%ymm13 \n\t" - "vmovaps 0x160(%1), %%ymm14 \n\t" - "vbroadcastss 0x30(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x34(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x38(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0x3C(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - - "add $0x180, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x24 \n\t" - ".align 16 \n\t" - ".k_loop_4x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x24_remain_end \n\t" - ".k_loop_4x24_remain: \n\t" - "vmovaps (%1), %%ymm12 \n\t" - "vmovaps 0x20(%1), %%ymm13 \n\t" - "vmovaps 0x40(%1), %%ymm14 \n\t" - "vbroadcastss 0x0(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" - "vbroadcastss 0xC(%2), %%ymm15 \n\t" - "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" - "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" - "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" - "add $0x60, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, 
%%ecx \n\t" - "jg .k_loop_4x24_remain \n\t" - - ".k_loop_4x24_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 0x40(%3) \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - "vmovups %%ymm5, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm6, %%ymm6 \n\t" - "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" - "vaddps 0x40(%3), %%ymm8, %%ymm8 \n\t" - "vmovups %%ymm6, (%3) \n\t" - "vmovups %%ymm7, 0x20(%3) \n\t" - "vmovups %%ymm8, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 0x40(%3) \n\t" - "vaddps (%3), %%ymm9, %%ymm9 \n\t" - "vaddps 0x20(%3), %%ymm10, %%ymm10 \n\t" - "vaddps 0x40(%3), %%ymm11, %%ymm11 \n\t" - "vmovups %%ymm9, (%3) \n\t" - "vmovups %%ymm10, 0x20(%3) \n\t" - "vmovups %%ymm11, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", - "%ymm13", "%ymm14", "%ymm15", "memory"); -} - -void mmm_avx2_4x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x16_end \n\t" - ".align 16 \n\t" - ".k_loop_4x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm8 \n\t" - "vmovaps 0x20(%1), %%ymm9 \n\t" - "vbroadcastss 0x0(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vmovaps 0x60(%1), %%ymm9 \n\t" - "vbroadcastss 0x10(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x14(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x18(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x1C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm8 \n\t" - "vmovaps 0xA0(%1), %%ymm9 \n\t" - 
"vbroadcastss 0x20(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x24(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x28(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x2C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm8 \n\t" - "vmovaps 0xE0(%1), %%ymm9 \n\t" - "vbroadcastss 0x30(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x34(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x38(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0x3C(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - - "add $0x100, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x16 \n\t" - ".align 16 \n\t" - ".k_loop_4x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x16_remain_end \n\t" - ".k_loop_4x16_remain: \n\t" - "vmovaps (%1), %%ymm8 \n\t" - "vmovaps 0x20(%1), %%ymm9 \n\t" - "vbroadcastss 0x0(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm0 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm2 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm4 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm10 \n\t" - "vfmadd231ps %%ymm10, %%ymm8, %%ymm6 \n\t" - "vfmadd231ps %%ymm10, %%ymm9, %%ymm7 \n\t" - "add $0x40, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x16_remain \n\t" - - ".k_loop_4x16_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm2, %%ymm2 \n\t" - "vaddps 0x20(%3), %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm2, (%3) \n\t" - "vmovups %%ymm3, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x20(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm4, (%3) \n\t" - "vmovups %%ymm5, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm6, %%ymm6 \n\t" - "vaddps 0x20(%3), %%ymm7, %%ymm7 \n\t" - "vmovups %%ymm6, (%3) \n\t" - "vmovups %%ymm7, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "memory"); -} - -void mmm_avx2_4x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x8_end \n\t" - ".align 16 \n\t" - ".k_loop_4x8: \n\t" - - "prefetcht0 0x140(%1) 
\n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%ymm4 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vbroadcastss 0x10(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x14(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x18(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x1C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm4 \n\t" - "vbroadcastss 0x20(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x24(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x28(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x2C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "vmovaps 0x60(%1), %%ymm4 \n\t" - "vbroadcastss 0x30(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x34(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x38(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0x3C(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - - "add $0x80, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x8 \n\t" - ".align 16 \n\t" - ".k_loop_4x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x8_remain_end \n\t" - ".k_loop_4x8_remain: \n\t" - "vmovaps (%1), %%ymm4 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm4, %%ymm3 \n\t" - "add $0x20, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x8_remain \n\t" - - ".k_loop_4x8_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm1, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm2, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm3, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "memory"); -} - -void mmm_avx2_4x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" - "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" - "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_4x4_end \n\t" - ".align 16 \n\t" - ".k_loop_4x4: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x140(%2) \n\t" - - "vmovaps (%1), %%xmm4 \n\t" - "vbroadcastss 0x0(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm5 
\n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0xC(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x10(%1), %%xmm4 \n\t" - "vbroadcastss 0x10(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x14(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x18(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x1C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x20(%1), %%xmm4 \n\t" - "vbroadcastss 0x20(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x24(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x28(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x2C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "vmovaps 0x30(%1), %%xmm4 \n\t" - "vbroadcastss 0x30(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x34(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x38(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0x3C(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - - "add $0x40, %1 \n\t" - "add $0x40, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_4x4 \n\t" - ".align 16 \n\t" - ".k_loop_4x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_4x4_remain_end \n\t" - - ".k_loop_4x4_remain: \n\t" - "vmovaps (%1), %%xmm4 \n\t" - "vbroadcastss 0x0(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm2 \n\t" - "vbroadcastss 0xC(%2), %%xmm5 \n\t" - "vfmadd231ps %%xmm5, %%xmm4, %%xmm3 \n\t" - "add $0x10, %1 \n\t" - "add $0x10, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_4x4_remain \n\t" - - ".k_loop_4x4_remain_end: \n\t" - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm1, %%xmm1 \n\t" - "vmovups %%xmm1, (%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%xmm2, %%xmm2 \n\t" - "vmovups %%xmm2, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%xmm3, %%xmm3 \n\t" - "vmovups %%xmm3, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "memory"); -} - -void mmm_avx2_2x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x24_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, 
%%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm6 \n\t" - "vmovaps 0x80(%1), %%ymm7 \n\t" - "vmovaps 0xA0(%1), %%ymm8 \n\t" - "vbroadcastss 0x8(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0xC(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm6 \n\t" - "vmovaps 0xE0(%1), %%ymm7 \n\t" - "vmovaps 0x100(%1), %%ymm8 \n\t" - "vbroadcastss 0x10(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x14(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm6 \n\t" - "vmovaps 0x140(%1), %%ymm7 \n\t" - "vmovaps 0x160(%1), %%ymm8 \n\t" - "vbroadcastss 0x18(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x1C(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - - "add $0x180, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x24 \n\t" - ".align 16 \n\t" - ".k_loop_2x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x24_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x24_remain: \n\t" - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vmovaps 0x40(%1), %%ymm8 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm2 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "vfmadd231ps %%ymm9, %%ymm8, %%ymm5 \n\t" - "add $0x60, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x24_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x24_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vaddps 0x40(%3), %%ymm5, %%ymm5 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - "vmovups %%ymm5, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "memory"); -} - -void mmm_avx2_2x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, 
%%ymm1, %%ymm1 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x16_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm6 \n\t" - "vmovaps 0x60(%1), %%ymm7 \n\t" - "vbroadcastss 0x8(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0xC(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm6 \n\t" - "vmovaps 0xA0(%1), %%ymm7 \n\t" - "vbroadcastss 0x10(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x14(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm6 \n\t" - "vmovaps 0xE0(%1), %%ymm7 \n\t" - "vbroadcastss 0x18(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x1C(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - - "add $0x100, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x16 \n\t" - ".align 16 \n\t" - ".k_loop_2x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x16_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x16_remain: \n\t" - "vmovaps (%1), %%ymm6 \n\t" - "vmovaps 0x20(%1), %%ymm7 \n\t" - "vbroadcastss 0x0(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm0 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm9 \n\t" - "vfmadd231ps %%ymm9, %%ymm6, %%ymm3 \n\t" - "vfmadd231ps %%ymm9, %%ymm7, %%ymm4 \n\t" - "add $0x40, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x16_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x16_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "add %%rax, %3 \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm3, %%ymm3 \n\t" - "vaddps 0x20(%3), %%ymm4, %%ymm4 \n\t" - "vmovups %%ymm3, (%3) \n\t" - "vmovups %%ymm4, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm3", "%ymm4", "%ymm6", - "%ymm7", "%ymm9", "memory"); -} - -void mmm_avx2_2x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x8_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x8: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vbroadcastss 0x0(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - 
"vmovaps 0x20(%1), %%ymm2 \n\t" - "vbroadcastss 0x8(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0xC(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "prefetcht0 0x180(%1) \n\t" - "vmovaps 0x40(%1), %%ymm2 \n\t" - "vbroadcastss 0x10(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x14(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "vmovaps 0x60(%1), %%ymm2 \n\t" - "vbroadcastss 0x18(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x1C(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - - "add $0x80, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x8 \n\t" - ".align 16 \n\t" - ".k_loop_2x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x8_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x8_remain: \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vbroadcastss 0x0(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm0 \n\t" - "vbroadcastss 0x4(%2), %%ymm3 \n\t" - "vfmadd231ps %%ymm3, %%ymm2, %%ymm1 \n\t" - "add $0x20, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x8_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x8_remain_end: \n\t" - - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm1, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"); -} - -void mmm_avx2_2x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_2x4_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x4: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%xmm2 \n\t" - "vbroadcastss 0x0(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x10(%1), %%xmm2 \n\t" - "vbroadcastss 0x8(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0xC(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x20(%1), %%xmm2 \n\t" - "vbroadcastss 0x10(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x14(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "vmovaps 0x30(%1), %%xmm2 \n\t" - "vbroadcastss 0x18(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x1C(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - - "add $0x40, %1 \n\t" - "add $0x20, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_2x4 \n\t" - ".align 16 \n\t" - ".k_loop_2x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_2x4_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_2x4_remain: \n\t" - "vmovaps (%1), %%xmm2 \n\t" - "vbroadcastss 0x0(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm0 \n\t" - "vbroadcastss 0x4(%2), %%xmm3 \n\t" - "vfmadd231ps %%xmm3, %%xmm2, %%xmm1 \n\t" - "add $0x10, %1 \n\t" - "add $0x8, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_2x4_remain \n\t" - - ".align 16 \n\t" - ".k_loop_2x4_remain_end: \n\t" - - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - "vaddps (%3), %%xmm1, %%xmm1 \n\t" - "vmovups %%xmm1, (%3) 
\n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"); -} - -void mmm_avx2_1x24_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x24_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x24: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "prefetcht0 0x180(%1) \n\t" - - "vmovaps (%1), %%ymm3 \n\t" - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vmovaps 0x40(%1), %%ymm5 \n\t" - "vbroadcastss 0x0(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x60(%1), %%ymm3 \n\t" - "vmovaps 0x80(%1), %%ymm4 \n\t" - "vmovaps 0xA0(%1), %%ymm5 \n\t" - "vbroadcastss 0x4(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x200(%1) \n\t" - "prefetcht0 0x240(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm3 \n\t" - "vmovaps 0xE0(%1), %%ymm4 \n\t" - "vmovaps 0x100(%1), %%ymm5 \n\t" - "vbroadcastss 0x8(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "prefetcht0 0x280(%1) \n\t" - - "vmovaps 0x120(%1), %%ymm3 \n\t" - "vmovaps 0x140(%1), %%ymm4 \n\t" - "vmovaps 0x160(%1), %%ymm5 \n\t" - "vbroadcastss 0xC(%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - - "add $0x180, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x24 \n\t" - ".align 16 \n\t" - ".k_loop_1x24_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x24_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x24_remain: \n\t" - "vmovaps (%1), %%ymm3 \n\t" - "vmovaps 0x20(%1), %%ymm4 \n\t" - "vmovaps 0x40(%1), %%ymm5 \n\t" - "vbroadcastss (%2), %%ymm6 \n\t" - "vfmadd231ps %%ymm6, %%ymm3, %%ymm0 \n\t" - "vfmadd231ps %%ymm6, %%ymm4, %%ymm1 \n\t" - "vfmadd231ps %%ymm6, %%ymm5, %%ymm2 \n\t" - "add $0x60, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x24_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x24_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "prefetcht0 0x40(%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vaddps 0x40(%3), %%ymm2, %%ymm2 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - "vmovups %%ymm2, 0x40(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", - "%ymm5", "%ymm6", "memory"); -} - -void mmm_avx2_1x16_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__( - "mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x16_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x16: \n\t" - - "prefetcht0 0x140(%1) \n\t" - - "vmovaps (%1), %%ymm2 \n\t" - "vmovaps 0x20(%1), %%ymm3 \n\t" - "vbroadcastss (%2), %%ymm5 
\n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x180(%1) \n\t" - - "vmovaps 0x40(%1), %%ymm2 \n\t" - "vmovaps 0x60(%1), %%ymm3 \n\t" - "vbroadcastss 0x4(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x1C0(%1) \n\t" - - "vmovaps 0x80(%1), %%ymm2 \n\t" - "vmovaps 0xA0(%1), %%ymm3 \n\t" - "vbroadcastss 0x8(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "prefetcht0 0x200(%1) \n\t" - - "vmovaps 0xC0(%1), %%ymm2 \n\t" - "vmovaps 0xE0(%1), %%ymm3 \n\t" - "vbroadcastss 0xC(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - - "add $0x100, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x16 \n\t" - ".align 16 \n\t" - ".k_loop_1x16_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x16_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x16_remain: \n\t" - "vmovaps (%1), %%ymm2 \n\t" - "vmovaps 0x20(%1), %%ymm3 \n\t" - "vbroadcastss 0x0(%2), %%ymm5 \n\t" - "vfmadd231ps %%ymm5, %%ymm2, %%ymm0 \n\t" - "vfmadd231ps %%ymm5, %%ymm3, %%ymm1 \n\t" - "add $0x40, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x16_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x16_remain_end: \n\t" - "prefetcht0 (%3, %%rax) \n\t" - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vaddps 0x20(%3), %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%3) \n\t" - "vmovups %%ymm1, 0x20(%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm5", "memory"); -} - -void mmm_avx2_1x8_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x8_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x8: \n\t" - - "prefetcht0 0x140(%1) \n\t" - "vmovaps (%1), %%ymm1 \n\t" - "vbroadcastss (%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "vmovaps 0x20(%1), %%ymm1 \n\t" - "vbroadcastss 0x4(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "prefetcht0 0x180(%1) \n\t" - "vmovaps 0x40(%1), %%ymm1 \n\t" - "vbroadcastss 0x8(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "vmovaps 0x60(%1), %%ymm1 \n\t" - "vbroadcastss 0xC(%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - - "add $0x80, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x8 \n\t" - ".align 16 \n\t" - ".k_loop_1x8_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x8_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x8_remain: \n\t" - "vmovaps (%1), %%ymm1 \n\t" - "vbroadcastss (%2), %%ymm2 \n\t" - "vfmadd231ps %%ymm2, %%ymm1, %%ymm0 \n\t" - "add $0x20, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x8_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x8_remain_end: \n\t" - - "vaddps (%3), %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%3) \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%ymm0", "%ymm1", "%ymm2", "memory"); -} - -void mmm_avx2_1x4_asm(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - __asm__ __volatile__("mov %4, %%eax \n\t" - "shl $2, %%eax \n\t" - "mov %%eax, %%eax \n\t" - - 
"vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" - - "mov %0, %%ecx \n\t" - "shr $2, %%ecx \n\t" - "je .k_loop_1x4_end \n\t" - ".align 16 \n\t" - ".k_loop_1x4: \n\t" - - "prefetcht0 0x40(%1) \n\t" - - "vmovaps (%1), %%xmm1 \n\t" - "vbroadcastss 0x0(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x10(%1), %%xmm1 \n\t" - "vbroadcastss 0x4(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x20(%1), %%xmm1 \n\t" - "vbroadcastss 0x8(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "vmovaps 0x30(%1), %%xmm1 \n\t" - "vbroadcastss 0xC(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - - "add $0x40, %1 \n\t" - "add $0x10, %2 \n\t" - - "sub $1, %%ecx \n\t" - "jg .k_loop_1x4 \n\t" - ".align 16 \n\t" - ".k_loop_1x4_end: \n\t" - - "mov %0, %%ecx \n\t" - "and $3, %%ecx \n\t" - "je .k_loop_1x4_remain_end \n\t" - - ".align 16 \n\t" - ".k_loop_1x4_remain: \n\t" - "vmovaps (%1), %%xmm1 \n\t" - "vbroadcastss 0x0(%2), %%xmm2 \n\t" - "vfmadd231ps %%xmm2, %%xmm1, %%xmm0 \n\t" - "add $0x10, %1 \n\t" - "add $0x4, %2 \n\t" - "sub $1, %%ecx \n\t" - "jg .k_loop_1x4_remain \n\t" - - ".align 16 \n\t" - ".k_loop_1x4_remain_end: \n\t" - - "vaddps (%3), %%xmm0, %%xmm0 \n\t" - "vmovups %%xmm0, (%3) \n\t" - "add %%rax, %3 \n\t" - : - : "r"(bk), "r"(matrixB), "r"(matrixA), "r"(matrixC), "r"(N) - : "%eax", "%rax", "%ecx", "%xmm0", "%xmm1", "%xmm2", "memory"); -} - -void mmm_avx2_n_mtail(U32 um, U32 un, U32 bk, F32 *matrixA, F32 *matrixB, F32 *matrixC, U32 N) -{ - for (U32 i = 0; i < um; ++i) { - for (U32 j = 0; j < un; ++j) { - for (U32 k = 0; k < bk; ++k) { - matrixC[i * N + j] += matrixA[k * um + i] * matrixB[k * un + j]; - } - } - } -} - EE mmm_avx2_fp32( int N, int M, int K, DataFormat matrix1Df, F32 *matrix1, F32 *matrix2, F32 *tmp, F32 *result) { // buffer addr algined to 32 F32 *packA = (F32 *)align_addr(tmp, 32); F32 *packB = (F32 *)align_addr(matrix2, 32); - kernel_func kernel[3][5] = { + kernel_func kernel[4][5] = { {mmm_avx2_n_mtail, mmm_avx2_1x4_asm, mmm_avx2_1x8_asm, mmm_avx2_1x16_asm, mmm_avx2_1x24_asm}, {mmm_avx2_n_mtail, mmm_avx2_2x4_asm, mmm_avx2_2x8_asm, mmm_avx2_2x16_asm, mmm_avx2_2x24_asm}, + {mmm_avx2_n_mtail, mmm_avx2_3x4_asm, mmm_avx2_3x8_asm, mmm_avx2_3x16_asm, mmm_avx2_3x24_asm}, {mmm_avx2_n_mtail, mmm_avx2_4x4_asm, mmm_avx2_4x8_asm, mmm_avx2_4x16_asm, mmm_avx2_4x24_asm}}; F32 unrollNSize[4] = {4, 8, 16, 24}; - F32 unrollMSize[3] = {1, 2, 4}; - I32 resN = N % 24; - I32 blockNNum = N / 24; - I32 edgeblockNSizeArray[5] = {0}; - for (U32 i = 0; resN > 0; ++i) { - U32 value = UNI_MIN(unrollNSize[resN >> 3], resN); - edgeblockNSizeArray[i] += value; - edgeblockNSizeArray[i + 1] = edgeblockNSizeArray[i]; - resN -= value; - blockNNum += 1; + F32 unrollMSize[4] = {1, 2, 3, 4}; + I32 resN = N % UNROLL_N; + I32 blockNNum = N / UNROLL_N + (resN > 0); + I32 edgeBlockNSizeIdx = (resN > 4) ? ((resN + 7) / 8) : 0; + I32 edgeBlockNSize = (resN > 0) ? unrollNSize[edgeBlockNSizeIdx] : 0; + I32 mask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; + if (resN != edgeBlockNSize) { + UNI_MEMSET(mask + resN % 8, 0, (edgeBlockNSize - resN) * 4); + } + I32 *maskPtr = (N % 4 != 0) ? 
mask : nullptr; + I32 alginedN = (blockNNum - 1) * UNROLL_N + edgeBlockNSize; + if (edgeBlockNSize == 0) { + alginedN += UNROLL_N; } + I32 blockNum = (M + 3) / 4 * blockNNum; + I32 mainBlockNum = (BOLCK_M_DIM + 3) / 4 * blockNNum; #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif - I32 blockSizeM = 0, blockSizeK = 0; + I32 blockSizeK = 0; for (int k = 0; k < K; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); - for (int j = 0; j < M; j += blockSizeM) { - blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); - I32 blockMNum = blockSizeM / 4 + (blockSizeM % 4 + 1) / 2; -#ifdef _USE_OPENMP -#pragma omp for -#endif - for (I32 mIdx = 0; mIdx < blockMNum; ++mIdx) { - I32 m = mIdx * 4 - ((mIdx * 4) > blockSizeM) * 2; - I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); - unrollSizeM = unrollMSize[unrollSizeM >> 1]; - - I32 blockSizeN = UNI_MIN(UNROLL_N, N); - blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); - - F32 *curB = packB + k * N; - F32 *curA = packA + m * blockSizeK; - if (matrix1Df == DF_TRANSPOSE) { - matrix2_trans(unrollSizeM, blockSizeK, M, matrix1 + (j + m) + k * M, curA); - } else if (matrix1Df == DF_NORMAL) { - matrix1_trans(unrollSizeM, blockSizeK, K, matrix1 + k + (j + m) * K, curA); - } else if (matrix1Df == DF_NKN8) { - matrix2_trans_c8( - unrollSizeM, blockSizeK, M, matrix1 + (j + m) * 8 + k * M, curA); - } - kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)]( - unrollSizeM, blockSizeN, blockSizeK, curA, curB, result + (m + j) * N, N); - } + if (matrix1Df == DF_TRANSPOSE) { + matrix1_trans(blockSizeK, M, M, matrix1 + k * M, packA); + } + #ifdef _USE_OPENMP #pragma omp for #endif - for (int mnIdx = blockMNum; mnIdx < blockNNum * blockMNum; ++mnIdx) { - I32 nIdx = mnIdx / blockMNum; - I32 n = nIdx * UNROLL_N; - if (n >= N) { - U32 idx = (n - N) / UNROLL_N; - CHECK_REQUIREMENT(idx <= 4); - n = N / UNROLL_N * UNROLL_N + edgeblockNSizeArray[idx]; - } - I32 blockSizeN = UNI_MIN(UNROLL_N, N - n); - blockSizeN = UNI_MIN(unrollNSize[blockSizeN >> 3], blockSizeN); - F32 *curB = packB + k * N + n * blockSizeK; - - I32 mIdx = mnIdx % blockMNum; - I32 m = mIdx * 4 - ((mIdx * 4) > blockSizeM) * 2; - I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); - unrollSizeM = unrollMSize[unrollSizeM >> 1]; - kernel[unrollSizeM >> 1][(blockSizeN >> 3) + (blockSizeN > 3)](unrollSizeM, - blockSizeN, blockSizeK, packA + m * blockSizeK, curB, - result + (m + j) * N + n, N); + for (int mnIdx = 0; mnIdx < blockNum; ++mnIdx) { + I32 j = mnIdx / mainBlockNum * BOLCK_M_DIM; + I32 blockSizeM = UNI_MIN(BOLCK_M_DIM, M - j); + I32 blockMNum = (blockSizeM + 3) / 4; + + I32 n = (mnIdx % mainBlockNum) / blockMNum * UNROLL_N; + I32 blockSizeN = UNI_MAX(UNI_MIN(UNROLL_N, N - n), edgeBlockNSize); + F32 *curB = packB + k * alginedN + n * blockSizeK; + maskPtr = ((blockSizeN + n) > N) ? 
mask : nullptr; + + I32 m = ((mnIdx % mainBlockNum) % blockMNum) * UNROLL_M; + I32 unrollSizeM = UNI_MIN(UNROLL_M, blockSizeM - m); + + F32 *curA, *A1, *A2, *A3; + if (matrix1Df == DF_TRANSPOSE) { + curA = packA + m * blockSizeK; + A1 = curA + blockSizeK; + A2 = curA + 2 * blockSizeK; + A3 = curA + 3 * blockSizeK; + } else { + curA = matrix1 + k + (j + m) * K; + A1 = curA + K; + A2 = curA + 2 * K; + A3 = curA + 3 * K; } + + kernel[unrollSizeM - 1][(blockSizeN >> 3) + (blockSizeN > 3)](unrollSizeM, + blockSizeN, blockSizeK, curA, curB, result + (m + j) * N + n, N, maskPtr, + A1, A2, A3); } } #ifdef _USE_OPENMP diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp index ab67bff0..9334496f 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_pack.cpp @@ -61,7 +61,7 @@ EE matrix_vector_multiply_transform_weight_fp32(TensorDesc desc, F32 *src, F32 * unrollSizeN = unrollSize[unrollSizeN / 16 - (unrollSizeN >= 48)]; if (N - un < unrollSizeN) { for (U32 k = 0; k < blockKSize; ++k) { - memcpy(packB + k * (N - un), src + (k + bk) * N + un, + UNI_MEMCPY(packB + k * (N - un), src + (k + bk) * N + un, (N - un) * sizeof(F32)); } packB += (N - un) * blockKSize; @@ -265,7 +265,8 @@ void mvm_pack_fp32(U32 numRows, U32 numColumns, F32 *packB, F32 *vector, F32 *re blockNum += 1; } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif U32 private_blockKSize = 0; diff --git a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp index c8fd8afd..77aeb1b8 100644 --- a/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp +++ b/compute/blas_enhance/src/cpu/x86/fp32/mvm_avx2_row.cpp @@ -238,13 +238,13 @@ void mvm_row_avx_4_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" - "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "addss 0x4(%3), %%xmm1 \n\t" "vmovss %%xmm1, 0x4(%3) \n\t" - "vaddps 0x8(%3), %%xmm2, %%xmm2 \n\t" + "addss 0x8(%3), %%xmm2 \n\t" "vmovss %%xmm2, 0x8(%3) \n\t" - "vaddps 0xC(%3), %%xmm3, %%xmm3 \n\t" + "addss 0xC(%3), %%xmm3 \n\t" "vmovss %%xmm3, 0xC(%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -398,9 +398,9 @@ void mvm_row_avx_2_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".n2_k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" - "vaddps 0x4(%3), %%xmm1, %%xmm1 \n\t" + "addss 0x4(%3), %%xmm1 \n\t" "vmovss %%xmm1, 0x4(%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -513,7 +513,7 @@ void mvm_row_avx_1_32(U32 bk, U32 lda, F32 *matrix, F32 *vector, F32 *result) ".align 16 \n\t" ".n1_k_loop_remain_1_end: \n\t" - "vaddps (%3), %%xmm0, %%xmm0 \n\t" + "addss (%3), %%xmm0 \n\t" "vmovss %%xmm0, (%3) \n\t" : : "r"(bk), "r"(matrix), "r"(vector), "r"(result), "r"(lda) @@ -528,7 +528,8 @@ void mvm_row_fp32(U32 numRows, U32 numColumns, F32 *matrix, F32 *vector, F32 *re U32 unrollNSize[3] = {1, 2, 4}; U32 blockNum = numRows / 4 + (numRows % 4 + 1) / 2; #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) + int in_parallel = omp_in_parallel(); +#pragma omp parallel 
num_threads(OMP_NUM_THREADS) if (in_parallel == 0) { #endif U32 private_blockKSize = 0; diff --git a/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h b/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h index 603b4d61..2f761711 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h +++ b/compute/blas_enhance/src/cpu/x86/int8/blas_int8.h @@ -21,15 +21,15 @@ #include "uni.h" #define SIMDW 8 -#define align_size(size, unit) ((size + unit - 1) / unit * unit) void matrix_matrix_multiply_tmp_bytes_int8( - U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes); + U32 row1, U32 col1, U32 row2, U32 col2, DataFormat df, DataType dt, U32 *bytes); // transform no-transposed B to K4, offline -inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *src, INT8 *dst) +inline void matrix1_trans_l( + int size, int alignedN, int blockK, int K, int alignSize, INT8 *src, INT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); int blockKF32 = blockK / 4; __m256i vindex = _mm256_set_epi32(K * 7, K * 6, K * 5, K * 4, K * 3, K * 2, K, 0); int i; @@ -52,13 +52,18 @@ inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *sr } j *= 8; for (; j < size; ++j) { - memcpy(dst, src + i * 4 + j * K, 4); + UNI_MEMCPY(dst, src + i * 4 + j * K, 4); dst += 4; } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } i *= 4; for (; i < alignedBlockK; i += 4) { - for (int j = 0; j < size; ++j) { + int j = 0; + for (; j < size; ++j) { for (int ii = i; ii < i + 4; ++ii) { if (ii < blockK) { *(dst++) = src[ii + j * K]; @@ -67,15 +72,21 @@ inline void matrix1_trans_l(int size, int blockK, int K, int alignSize, INT8 *sr } } } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } } // transform transposed B to K4, offline -inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *src, INT8 *dst) +inline void matrix2_trans_l( + int size, int alignedN, int blockK, int N, int alignSize, INT8 *src, INT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); for (int i = 0; i < alignedBlockK; i += 4) { - for (int j = 0; j < size; ++j) { + int j = 0; + for (; j < size; ++j) { for (int ii = i; ii < (i + 4); ++ii) { if (ii < blockK) { *(dst++) = src[ii * N + j]; @@ -84,6 +95,10 @@ inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *sr } } } + if (j < alignedN) { + UNI_MEMSET(dst, 0, 4 * (alignedN - size)); + dst += 4 * (alignedN - size); + } } } @@ -91,7 +106,7 @@ inline void matrix2_trans_l(int size, int blockK, int N, int alignSize, INT8 *sr inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *src, UINT8 *dst) { // TODO: optimize - int alignedBlockK = align_size(blockK, alignSize); + int alignedBlockK = UNI_ALIGN(blockK, alignSize); for (int j = 0; j < size; ++j) { int i = 0; for (i = 0; i < blockK; ++i) { @@ -101,7 +116,7 @@ inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *s *(dst++) = *(src + i * M + j); } for (; i < alignedBlockK; ++i) { - *(dst++) = 0; + *(dst++) = 128; } } } @@ -109,23 +124,21 @@ inline void matrix2_trans_r(int size, int blockK, int M, int alignSize, UINT8 *s // transpose A, online inline void matrix1_trans_r(int size, int blockK, int K, int alignSize, UINT8 *src, UINT8 *dst) { - int alignedBlockK = align_size(blockK, alignSize); + int 
alignedBlockK = UNI_ALIGN(blockK, alignSize); if (alignedBlockK != blockK) { - memset(dst, 0, alignedBlockK * size); + UNI_MEMSET(dst, 0, alignedBlockK * size); } for (int j = 0; j < size; ++j) { - memcpy(dst + j * alignedBlockK, src + j * K, blockK); + UNI_MEMCPY(dst + j * alignedBlockK, src + j * K, blockK); } } EE matrix_vector_multiply_transform_weight_int8( TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias); -EE matrix_matrix_multiply_transform_rhsN_int8( - TensorDesc desc, INT8 *src, INT8 *dst, I32 *offsetCBias); +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *dst); -EE matrix_matrix_multiply_transform_rhsT_int8( - TensorDesc desc, INT8 *src, INT8 *dst, I32 *offsetCBias); +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *dst); EE mmm_avx512_vnni_int8(U32 M, U32 N, diff --git a/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp b/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp index 6dc580a0..d2a4a168 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mmm_avx512_vnni.cpp @@ -30,22 +30,2014 @@ typedef void (*kernel_func)(U32 um, U32 N, U32 stepK, const F32 *scale, + U32 nmask, + UINT8 *resK, U32 flags); +// clang-format off +#define loadOffset_1_1(rtype) \ + "vmovups (%[offset]), "#rtype"0 \n\t" + +#define loadOffset_6_1(rtype) \ + loadOffset_1_1(rtype) \ + "vmovups "#rtype"0, "#rtype"1 \n\t" \ + "vmovups "#rtype"0, "#rtype"2 \n\t" \ + "vmovups "#rtype"0, "#rtype"3 \n\t" \ + "vmovups "#rtype"0, "#rtype"4 \n\t" \ + "vmovups "#rtype"0, "#rtype"5 \n\t" + +#define loadOffset_12_1(rtype) \ + loadOffset_6_1(rtype) \ + "vmovups "#rtype"0, "#rtype"6 \n\t" \ + "vmovups "#rtype"0, "#rtype"7 \n\t" \ + "vmovups "#rtype"0, "#rtype"8 \n\t" \ + "vmovups "#rtype"0, "#rtype"9 \n\t" \ + "vmovups "#rtype"0, "#rtype"10 \n\t" \ + "vmovups "#rtype"0, "#rtype"11 \n\t" + +#define loadOffset_24_1(rtype) \ + loadOffset_12_1(rtype) \ + "vmovups "#rtype"0, "#rtype"12 \n\t" \ + "vmovups "#rtype"0, "#rtype"13 \n\t" \ + "vmovups "#rtype"0, "#rtype"14 \n\t" \ + "vmovups "#rtype"0, "#rtype"15 \n\t" \ + "vmovups "#rtype"0, "#rtype"16 \n\t" \ + "vmovups "#rtype"0, "#rtype"17 \n\t" \ + "vmovups "#rtype"0, "#rtype"18 \n\t" \ + "vmovups "#rtype"0, "#rtype"19 \n\t" \ + "vmovups "#rtype"0, "#rtype"20 \n\t" \ + "vmovups "#rtype"0, "#rtype"21 \n\t" \ + "vmovups "#rtype"0, "#rtype"22 \n\t" \ + "vmovups "#rtype"0, "#rtype"23 \n\t" + +#define loadOffset_1_2 \ + loadOffset_1_1(%%zmm) \ + "vmovups 0x40(%[offset]), %%zmm1 \n\t" + +#define loadOffset_3_2 \ + loadOffset_1_2 \ + "vmovups %%zmm0, %%zmm2 \n\t" \ + "vmovups %%zmm1, %%zmm3 \n\t" \ + "vmovups %%zmm0, %%zmm4 \n\t" \ + "vmovups %%zmm1, %%zmm5 \n\t" + +#define loadOffset_6_2 \ + loadOffset_3_2 \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm0, %%zmm8 \n\t" \ + "vmovups %%zmm1, %%zmm9 \n\t" \ + "vmovups %%zmm0, %%zmm10 \n\t" \ + "vmovups %%zmm1, %%zmm11 \n\t" + +#define loadOffset_12_2 \ + loadOffset_6_2 \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm0, %%zmm14 \n\t" \ + "vmovups %%zmm1, %%zmm15 \n\t" \ + "vmovups %%zmm0, %%zmm16 \n\t" \ + "vmovups %%zmm1, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm0, %%zmm20 \n\t" \ + "vmovups %%zmm1, %%zmm21 \n\t" \ + "vmovups %%zmm0, %%zmm22 \n\t" \ + "vmovups %%zmm1, %%zmm23 \n\t" + +#define loadOffset_1_3 \ + loadOffset_1_2 \ + "vmovups 0x80(%[offset]), 
%%zmm2 \n\t" + +#define loadOffset_2_3 \ + loadOffset_1_3 \ + "vmovups %%zmm0, %%zmm3 \n\t" \ + "vmovups %%zmm1, %%zmm4 \n\t" \ + "vmovups %%zmm2, %%zmm5 \n\t" + +#define loadOffset_4_3 \ + loadOffset_2_3 \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm2, %%zmm8 \n\t" \ + "vmovups %%zmm0, %%zmm9 \n\t" \ + "vmovups %%zmm1, %%zmm10 \n\t" \ + "vmovups %%zmm2, %%zmm11 \n\t" + +#define loadOffset_8_3 \ + loadOffset_4_3 \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm2, %%zmm14 \n\t" \ + "vmovups %%zmm0, %%zmm15 \n\t" \ + "vmovups %%zmm1, %%zmm16 \n\t" \ + "vmovups %%zmm2, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm2, %%zmm20 \n\t" \ + "vmovups %%zmm0, %%zmm21 \n\t" \ + "vmovups %%zmm1, %%zmm22 \n\t" \ + "vmovups %%zmm2, %%zmm23 \n\t" + +#define addC_1_1(rtype, C) \ + "movq "#C", %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"0, "#rtype"0 \n\t" + +#define addC_6_1(rtype, C) \ + addC_1_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"1, "#rtype"1 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"2, "#rtype"2 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"3, "#rtype"3 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"4, "#rtype"4 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"5, "#rtype"5 \n\t" + +#define addC_12_1(rtype, C) \ + addC_6_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"6, "#rtype"6 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"7, "#rtype"7 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"8, "#rtype"8 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"9, "#rtype"9 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"10, "#rtype"10 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"11, "#rtype"11 \n\t" + +#define addC_24_1(rtype, C) \ + addC_12_1(rtype, C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"12, "#rtype"12 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"13, "#rtype"13 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"14, "#rtype"14 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"15, "#rtype"15 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"16, "#rtype"16 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"17, "#rtype"17 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"18, "#rtype"18 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"19, "#rtype"19 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"20, "#rtype"20 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"21, "#rtype"21 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"22, "#rtype"22 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), "#rtype"23, "#rtype"23 \n\t" + +#define addC_1_2(C) \ + addC_1_1(%%zmm, C) \ + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + +#define addC_3_2(C) \ + addC_1_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" + +#define addC_6_2(C) \ + addC_3_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" 
\ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm11, %%zmm11 \n\t" + +#define addC_12_2(C) \ + addC_6_2(C) \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm14, %%zmm14 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm15, %%zmm15 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm17, %%zmm17 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm19, %%zmm19 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm21, %%zmm21 \n\t" \ + "addq %[N], %%rax \n\t" \ + "vpaddd (%%rax), %%zmm22, %%zmm22 \n\t" \ + "vpaddd 0x40(%%rax), %%zmm23, %%zmm23 \n\t" + +#define addC_1_3(C) \ + "vpaddd ("#C"), %%zmm0, %%zmm0 \n\t" \ + "vpaddd 0x40("#C"), %%zmm1, %%zmm1 \n\t" \ + "vpaddd 0x80("#C"), %%zmm2, %%zmm2 \n\t" + +#define addC_2_3(C) \ + "vpaddd ("#C"), %%zmm0, %%zmm0 \n\t" \ + "vpaddd 0x40("#C"), %%zmm1, %%zmm1 \n\t" \ + "vpaddd 0x80("#C"), %%zmm2, %%zmm2 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm3, %%zmm3 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm4, %%zmm4 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm5, %%zmm5 \n\t" + +#define addC_4_3(C) \ + addC_2_3(C) \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm6, %%zmm6 \n\t" \ + "vpaddd 0x40("#C"), %%zmm7, %%zmm7 \n\t" \ + "vpaddd 0x80("#C"), %%zmm8, %%zmm8 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm9, %%zmm9 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm10, %%zmm10 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm11, %%zmm11 \n\t" + +#define addC_8_3(C) \ + addC_4_3(C) \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm12, %%zmm12 \n\t" \ + "vpaddd 0x40("#C"), %%zmm13, %%zmm13 \n\t" \ + "vpaddd 0x80("#C"), %%zmm14, %%zmm14 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm15, %%zmm15 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm16, %%zmm16 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm17, %%zmm17 \n\t" \ + "addq %%rcx, "#C" \n\t" \ + "vpaddd ("#C"), %%zmm18, %%zmm18 \n\t" \ + "vpaddd 0x40("#C"), %%zmm19, %%zmm19 \n\t" \ + "vpaddd 0x80("#C"), %%zmm20, %%zmm20 \n\t" \ + "vpaddd ("#C", %[N]), %%zmm21, %%zmm21 \n\t" \ + "vpaddd 0x40("#C", %[N]), %%zmm22, %%zmm22 \n\t" \ + "vpaddd 0x80("#C", %[N]), %%zmm23, %%zmm23 \n\t" \ + +#define storeC_1_1_0(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + #op" "#rtype"0, (%%rax) \n\t" + +#define storeC_2_1_0(op, rtype, C, off0, off1) \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"1, (%%rax) \n\t" + +#define storeC_3_1_0(op, rtype, C, off0, off1) \ + storeC_2_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" + +#define storeC_4_1_0(op, rtype, C, off0, off1) \ + storeC_3_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" + +#define storeC_5_1_0(op, rtype, C, off0, off1) \ + storeC_4_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" + +#define storeC_6_1_0(op, rtype, C, off0, off1) \ + storeC_5_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"5, (%%rax) \n\t" + +#define storeC_7_1_0(op, rtype, C, off0, off1) \ + storeC_6_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" + +#define storeC_8_1_0(op, rtype, C, off0, off1) \ + storeC_7_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"7, 
(%%rax) \n\t" + +#define storeC_9_1_0(op, rtype, C, off0, off1) \ + storeC_8_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" + +#define storeC_10_1_0(op, rtype, C, off0, off1) \ + storeC_9_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" + +#define storeC_11_1_0(op, rtype, C, off0, off1) \ + storeC_10_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" + +#define storeC_12_1_0(op, rtype, C, off0, off1) \ + storeC_11_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"11, (%%rax) \n\t" + +#define storeC_13_1_0(op, rtype, C, off0, off1) \ + storeC_12_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" + +#define storeC_14_1_0(op, rtype, C, off0, off1) \ + storeC_13_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"13, (%%rax) \n\t" + +#define storeC_15_1_0(op, rtype, C, off0, off1) \ + storeC_14_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" + +#define storeC_16_1_0(op, rtype, C, off0, off1) \ + storeC_15_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" + +#define storeC_17_1_0(op, rtype, C, off0, off1) \ + storeC_16_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" + +#define storeC_18_1_0(op, rtype, C, off0, off1) \ + storeC_17_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"17, (%%rax) \n\t" + +#define storeC_19_1_0(op, rtype, C, off0, off1) \ + storeC_18_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" + +#define storeC_20_1_0(op, rtype, C, off0, off1) \ + storeC_19_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"19, (%%rax) \n\t" + +#define storeC_21_1_0(op, rtype, C, off0, off1) \ + storeC_20_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" + +#define storeC_22_1_0(op, rtype, C, off0, off1) \ + storeC_21_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" + +#define storeC_23_1_0(op, rtype, C, off0, off1) \ + storeC_22_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" + +#define storeC_24_1_0(op, rtype, C, off0, off1) \ + storeC_23_1_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"23, (%%rax) \n\t" + +#define storeC_1_2_0(op, rtype, C, off0, off1) \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + #op" "#rtype"1, "#off0"(%%rax) \n\t" + +#define storeC_2_2_0(op, rtype, C, off0, off1) \ + storeC_1_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" \ + #op" "#rtype"3, "#off0"(%%rax) \n\t" + +#define storeC_3_2_0(op, rtype, C, off0, off1) \ + storeC_2_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" \ + #op" "#rtype"5, "#off0"(%%rax) \n\t" + +#define storeC_4_2_0(op, rtype, C, off0, off1) \ + storeC_3_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" + +#define storeC_5_2_0(op, rtype, C, off0, off1) \ + storeC_4_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" \ + #op" "#rtype"9, "#off0"(%%rax) \n\t" + +#define storeC_6_2_0(op, rtype, C, off0, off1) \ + storeC_5_2_0(op, rtype, C, off0, off1) \ + 
"addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" \ + #op" "#rtype"11, "#off0"(%%rax) \n\t" + +#define storeC_7_2_0(op, rtype, C, off0, off1) \ + storeC_6_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) \n\t" + +#define storeC_8_2_0(op, rtype, C, off0, off1) \ + storeC_7_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" \ + #op" "#rtype"15, "#off0"(%%rax) \n\t" + +#define storeC_9_2_0(op, rtype, C, off0, off1) \ + storeC_8_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" \ + #op" "#rtype"17, "#off0"(%%rax) \n\t" + +#define storeC_10_2_0(op, rtype, C, off0, off1) \ + storeC_9_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" + +#define storeC_11_2_0(op, rtype, C, off0, off1) \ + storeC_10_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" \ + #op" "#rtype"21, "#off0"(%%rax) \n\t" + +#define storeC_12_2_0(op, rtype, C, off0, off1) \ + storeC_11_2_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" \ + #op" "#rtype"23, "#off0"(%%rax) \n\t" + +#define storeC_1_3_0(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + #op" "#rtype"0, (%%rax) \n\t" \ + #op" "#rtype"1, "#off0"(%%rax) \n\t" \ + #op" "#rtype"2, "#off1"(%%rax) \n\t" + +#define storeC_2_3_0(op, rtype, C, off0, off1) \ + storeC_1_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" \ + #op" "#rtype"4, "#off0"(%%rax) \n\t" \ + #op" "#rtype"5, "#off1"(%%rax) \n\t" + +#define storeC_3_3_0(op, rtype, C, off0, off1) \ + storeC_2_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" \ + #op" "#rtype"8, "#off1"(%%rax) \n\t" + +#define storeC_4_3_0(op, rtype, C, off0, off1) \ + storeC_3_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" \ + #op" "#rtype"10, "#off0"(%%rax) \n\t" \ + #op" "#rtype"11, "#off1"(%%rax) \n\t" + +#define storeC_5_3_0(op, rtype, C, off0, off1) \ + storeC_4_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) \n\t" \ + #op" "#rtype"14, "#off1"(%%rax) \n\t" + +#define storeC_6_3_0(op, rtype, C, off0, off1) \ + storeC_5_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" \ + #op" "#rtype"16, "#off0"(%%rax) \n\t" \ + #op" "#rtype"17, "#off1"(%%rax) \n\t" + +#define storeC_7_3_0(op, rtype, C, off0, off1) \ + storeC_6_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" \ + #op" "#rtype"20, "#off1"(%%rax) \n\t" + +#define storeC_8_3_0(op, rtype, C, off0, off1) \ + storeC_7_3_0(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" \ + #op" "#rtype"22, "#off0"(%%rax) \n\t" \ + #op" "#rtype"23, "#off1"(%%rax) \n\t" + +#define storeC_1_1_1(op, rtype, C, off0, off1) \ + "movq "#C", %%rax \n\t" \ + "kmovw %[nmask], %%k1 \n\t" \ + #op" "#rtype"0, (%%rax) %{%%k1%} \n\t" + +#define storeC_2_1_1(op, rtype, C, off0, off1) \ + storeC_1_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"1, (%%rax) %{%%k1%} \n\t" + +#define storeC_3_1_1(op, rtype, C, off0, off1) \ + storeC_2_1_1(op, 
rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) %{%%k1%} \n\t" + +#define storeC_4_1_1(op, rtype, C, off0, off1) \ + storeC_3_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) %{%%k1%} \n\t" + +#define storeC_5_1_1(op, rtype, C, off0, off1) \ + storeC_4_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) %{%%k1%} \n\t" + +#define storeC_6_1_1(op, rtype, C, off0, off1) \ + storeC_5_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"5, (%%rax) %{%%k1%} \n\t" + +#define storeC_7_1_1(op, rtype, C, off0, off1) \ + storeC_6_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) %{%%k1%} \n\t" + +#define storeC_8_1_1(op, rtype, C, off0, off1) \ + storeC_7_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"7, (%%rax) %{%%k1%} \n\t" + +#define storeC_9_1_1(op, rtype, C, off0, off1) \ + storeC_8_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) %{%%k1%} \n\t" + +#define storeC_10_1_1(op, rtype, C, off0, off1) \ + storeC_9_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) %{%%k1%} \n\t" + +#define storeC_11_1_1(op, rtype, C, off0, off1) \ + storeC_10_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) %{%%k1%} \n\t" + +#define storeC_12_1_1(op, rtype, C, off0, off1) \ + storeC_11_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"11, (%%rax) %{%%k1%} \n\t" + +#define storeC_13_1_1(op, rtype, C, off0, off1) \ + storeC_12_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) %{%%k1%} \n\t" + +#define storeC_14_1_1(op, rtype, C, off0, off1) \ + storeC_13_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"13, (%%rax) %{%%k1%} \n\t" + +#define storeC_15_1_1(op, rtype, C, off0, off1) \ + storeC_14_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) %{%%k1%} \n\t" + +#define storeC_16_1_1(op, rtype, C, off0, off1) \ + storeC_15_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) %{%%k1%} \n\t" + +#define storeC_17_1_1(op, rtype, C, off0, off1) \ + storeC_16_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) %{%%k1%} \n\t" + +#define storeC_18_1_1(op, rtype, C, off0, off1) \ + storeC_17_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"17, (%%rax) %{%%k1%} \n\t" + +#define storeC_19_1_1(op, rtype, C, off0, off1) \ + storeC_18_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) %{%%k1%} \n\t" + +#define storeC_20_1_1(op, rtype, C, off0, off1) \ + storeC_19_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"19, (%%rax) %{%%k1%} \n\t" + +#define storeC_21_1_1(op, rtype, C, off0, off1) \ + storeC_20_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) %{%%k1%} \n\t" + +#define storeC_22_1_1(op, rtype, C, off0, off1) \ + storeC_21_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) %{%%k1%} \n\t" + +#define storeC_23_1_1(op, rtype, C, off0, off1) \ + storeC_22_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) %{%%k1%} \n\t" + +#define storeC_24_1_1(op, rtype, C, off0, off1) \ + storeC_23_1_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" 
"#rtype"23, (%%rax) %{%%k1%} \n\t" + +#define storeC_1_2_1(op, rtype, C, off0, off1) \ + "kmovw %[nmask], %%k1 \n\t" \ + storeC_1_1_0(op, rtype, C, off0, off1) \ + #op" "#rtype"1, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_2_2_1(op, rtype, C, off0, off1) \ + storeC_1_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"2, (%%rax) \n\t" \ + #op" "#rtype"3, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_3_2_1(op, rtype, C, off0, off1) \ + storeC_2_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"4, (%%rax) \n\t" \ + #op" "#rtype"5, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_4_2_1(op, rtype, C, off0, off1) \ + storeC_3_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_5_2_1(op, rtype, C, off0, off1) \ + storeC_4_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"8, (%%rax) \n\t" \ + #op" "#rtype"9, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_6_2_1(op, rtype, C, off0, off1) \ + storeC_5_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"10, (%%rax) \n\t" \ + #op" "#rtype"11, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_7_2_1(op, rtype, C, off0, off1) \ + storeC_6_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" \ + #op" "#rtype"13, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_8_2_1(op, rtype, C, off0, off1) \ + storeC_7_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"14, (%%rax) \n\t" \ + #op" "#rtype"15, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_9_2_1(op, rtype, C, off0, off1) \ + storeC_8_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"16, (%%rax) \n\t" \ + #op" "#rtype"17, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_10_2_1(op, rtype, C, off0, off1) \ + storeC_9_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_11_2_1(op, rtype, C, off0, off1) \ + storeC_10_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"20, (%%rax) \n\t" \ + #op" "#rtype"21, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_12_2_1(op, rtype, C, off0, off1) \ + storeC_11_2_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"22, (%%rax) \n\t" \ + #op" "#rtype"23, "#off0"(%%rax) %{%%k1%} \n\t" + +#define storeC_1_3_1(op, rtype, C, off0, off1) \ + "kmovw %[nmask], %%k1 \n\t" \ + storeC_1_2_0(op, rtype, C, off0, off1) \ + #op" "#rtype"2, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_2_3_1(op, rtype, C, off0, off1) \ + storeC_1_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"3, (%%rax) \n\t" \ + #op" "#rtype"4, "#off0"(%%rax) \n\t" \ + #op" "#rtype"5, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_3_3_1(op, rtype, C, off0, off1) \ + storeC_2_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"6, (%%rax) \n\t" \ + #op" "#rtype"7, "#off0"(%%rax) \n\t" \ + #op" "#rtype"8, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_4_3_1(op, rtype, C, off0, off1) \ + storeC_3_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"9, (%%rax) \n\t" \ + #op" "#rtype"10, "#off0"(%%rax) \n\t" \ + #op" "#rtype"11, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_5_3_1(op, rtype, C, off0, off1) \ + storeC_4_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"12, (%%rax) \n\t" 
\ + #op" "#rtype"13, "#off0"(%%rax) \n\t" \ + #op" "#rtype"14, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_6_3_1(op, rtype, C, off0, off1) \ + storeC_5_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"15, (%%rax) \n\t" \ + #op" "#rtype"16, "#off0"(%%rax) \n\t" \ + #op" "#rtype"17, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_7_3_1(op, rtype, C, off0, off1) \ + storeC_6_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"18, (%%rax) \n\t" \ + #op" "#rtype"19, "#off0"(%%rax) \n\t" \ + #op" "#rtype"20, "#off1"(%%rax) %{%%k1%} \n\t" + +#define storeC_8_3_1(op, rtype, C, off0, off1) \ + storeC_7_3_1(op, rtype, C, off0, off1) \ + "addq %[N], %%rax \n\t" \ + #op" "#rtype"21, (%%rax) \n\t" \ + #op" "#rtype"22, "#off0"(%%rax) \n\t" \ + #op" "#rtype"23, "#off1"(%%rax) %{%%k1%} \n\t" + +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear4Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" + +#define clear6Regs(rtype) \ + clear4Regs(rtype) \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" + +#define clear8Regs(rtype) \ + clear6Regs(rtype) \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" + +#define clear9Regs(rtype) \ + clear8Regs(rtype) \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" + +#define clear12Regs(rtype) \ + clear9Regs(rtype) \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define clear24Regs(rtype) \ + clear12Regs(rtype) \ + "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ + "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ + "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ + "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ + "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ + "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ + "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ + "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ + "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ + "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ + "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ + "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" + +#define convert1I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" + +#define convert2I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" + +#define convert3I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" + +#define convert4I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, 
"#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" + +#define convert6I32Regs2Ps(rtype, sReg) \ + "vbroadcastss ("#sReg"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ + "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" + +#define convert12I32Regs2Ps(rtype, sReg) \ + convert6I32Regs2Ps(rtype, sReg) \ + "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ + "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ + "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ + "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ + "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ + "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ + "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24I32Regs2Ps(rtype, sReg) \ + convert12I32Regs2Ps(rtype, sReg) \ + "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ + "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ + "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ + "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ + "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ + "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ + "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ + "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ + "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ + "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ + "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ + "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ + "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define convert1PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" + +#define convert2PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" + +#define convert3PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq 
"#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" + +#define convert4PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vcvtps2dq "#rtype"3, "#rtype"3 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vpaddd "#rtype"3, "#rtype"24, "#rtype"3 \n\t" + +#define convert6PsRegs2U8(rtype) \ + "mov $128, %%eax \n\t" \ + "vmovd %%eax, %%xmm25 \n\t" \ + "vbroadcastss %%xmm25, "#rtype"24 \n\t" \ + "vcvtps2dq "#rtype"0, "#rtype"0 \n\t" \ + "vcvtps2dq "#rtype"1, "#rtype"1 \n\t" \ + "vcvtps2dq "#rtype"2, "#rtype"2 \n\t" \ + "vcvtps2dq "#rtype"3, "#rtype"3 \n\t" \ + "vcvtps2dq "#rtype"4, "#rtype"4 \n\t" \ + "vcvtps2dq "#rtype"5, "#rtype"5 \n\t" \ + "vpaddd "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vpaddd "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"24, "#rtype"5 \n\t" + +#define convert12PsRegs2U8(rtype) \ + convert6PsRegs2U8(rtype) \ + "vcvtps2dq "#rtype"6, "#rtype"6 \n\t" \ + "vcvtps2dq "#rtype"7, "#rtype"7 \n\t" \ + "vcvtps2dq "#rtype"8, "#rtype"8 \n\t" \ + "vcvtps2dq "#rtype"9, "#rtype"9 \n\t" \ + "vcvtps2dq "#rtype"10, "#rtype"10 \n\t" \ + "vcvtps2dq "#rtype"11, "#rtype"11 \n\t" \ + "vpaddd "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vpaddd "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24PsRegs2U8(rtype) \ + convert12PsRegs2U8(rtype) \ + "vcvtps2dq "#rtype"12, "#rtype"12 \n\t" \ + "vcvtps2dq "#rtype"13, "#rtype"13 \n\t" \ + "vcvtps2dq "#rtype"14, "#rtype"14 \n\t" \ + "vcvtps2dq "#rtype"15, "#rtype"15 \n\t" \ + "vcvtps2dq "#rtype"16, "#rtype"16 \n\t" \ + "vcvtps2dq "#rtype"17, "#rtype"17 \n\t" \ + "vcvtps2dq "#rtype"18, "#rtype"18 \n\t" \ + "vcvtps2dq "#rtype"19, "#rtype"19 \n\t" \ + "vcvtps2dq "#rtype"20, "#rtype"20 \n\t" \ + "vcvtps2dq "#rtype"21, "#rtype"21 \n\t" \ + "vcvtps2dq "#rtype"22, "#rtype"22 \n\t" \ + "vcvtps2dq "#rtype"23, "#rtype"23 \n\t" \ + "vpaddd "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vpaddd "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vpaddd "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vpaddd "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define mmm_1_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + 
"vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm2 \n\t" + +#define mmm_2_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%1) \n\t" \ + "prefetcht0 0x100(%1) \n\t" \ + "prefetcht0 0x140(%1) \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 0x4(%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" + +#define mmm_4_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%1) \n\t" \ + "prefetcht0 0x100(%1) \n\t" \ + "prefetcht0 0x140(%1) \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd 
%%zmm27, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" + +#define mmm_8_48(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x40(%[B]), %%zmm28 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm14 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm17 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm29 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "prefetcht0 0x180(%[B]) \n\t" \ + "prefetcht0 0x1C0(%[B]) \n\t" \ + "prefetcht0 0x200(%[B]) \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x100(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm14 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm17 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq "#K", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm30 \n\t" \ + "vpbroadcastd 
(%%rax, "#K"), %%zmm31 \n\t" \ + "vmovups 0x140(%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd %%zmm28, %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd %%zmm29, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd %%zmm28, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm29, %%zmm31, %%zmm23 \n\t" + +#define mmm_1_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd 0x4(%%rax), %%zmm29 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm1 \n\t" + +#define mmm_3_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" + +#define mmm_6_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" 
\ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" + +#define mmm_12_32(A, K) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups (%[B]), %%zmm26 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm15 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0x40(%[B]), %%zmm27 \n\t" \ + "vpdpbusd %%zmm24, %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd %%zmm25, %%zmm28, %%zmm17 \n\t" \ + "vpdpbusd %%zmm24, %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd %%zmm25, %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd %%zmm24, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm25, %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd %%zmm24, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm25, %%zmm31, %%zmm23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ + 
"vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0x80(%[B]), %%zmm24 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ + "addq "#K", %%rax \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), %%zmm28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), %%zmm29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), %%zmm30 \n\t" \ + "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ + "vmovups 0xC0(%[B]), %%zmm25 \n\t" \ + "vpdpbusd %%zmm26, %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd %%zmm27, %%zmm28, %%zmm17 \n\t" \ + "vpdpbusd %%zmm26, %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd %%zmm27, %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd %%zmm27, %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd %%zmm26, %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd %%zmm27, %%zmm31, %%zmm23 \n\t" + +#define mmm_1_16(A, K, rtype, off) \ + "vpbroadcastd ("#A"), "#rtype"25 \n\t" \ + "vpbroadcastd 0x4("#A"), "#rtype"26 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"0 \n\t" + +#define mmm_6_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + +#define mmm_12_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, 
%%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"11 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"11 \n\t" + +#define mmm_24_16(A, K, rtype, off) \ + "movq "#A", %%rax \n\t" \ + "movq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "addq "#K", %%rbx \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "prefetcht0 0x80(%[B]) \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"5 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, 
"#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"11 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vmovups (%[B]), "#rtype"31 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"12 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"13 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"14 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"15 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"16 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"17 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"25, "#rtype"18 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"26, "#rtype"19 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"27, "#rtype"20 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"28, "#rtype"21 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"29, "#rtype"22 \n\t" \ + "vpdpbusd "#rtype"24, "#rtype"30, "#rtype"23 \n\t" \ + "movq "#A", %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"0 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"1 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"2 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"3 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"4 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"5 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"6 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"7 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"8 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"9 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"10 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"11 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), 
"#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vmovups "#off"(%[B]), "#rtype"24 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"12 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"13 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"14 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"15 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"16 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"17 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"25 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"26 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"27 \n\t" \ + "addq %%rbx, %%rax \n\t" \ + "vpbroadcastd (%%rax), "#rtype"28 \n\t" \ + "vpbroadcastd (%%rax, "#K"), "#rtype"29 \n\t" \ + "vpbroadcastd (%%rax, "#K", 2), "#rtype"30 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"25, "#rtype"18 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"26, "#rtype"19 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"27, "#rtype"20 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"28, "#rtype"21 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"29, "#rtype"22 \n\t" \ + "vpdpbusd "#rtype"31, "#rtype"30, "#rtype"23 \n\t" + +#define mmm_m_48_asm(m, n, nRegs, mRegs, edge) \ + __asm__ __volatile__( \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "prefetcht0 0x140(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm24 \n\t" \ + "vmovups 0x40(%[B]), %%zmm25 \n\t" \ + "vmovups 0x80(%[B]), %%zmm26 \n\t" \ + "add $0xC0, %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 0x80(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "prefetcht0 0x80(%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_48(%[A], %[K]) \ + "add $0x180, %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_48(%[resK], %%rcx) \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[C], %%rax \n\t" \ + "movq %[N], %%rcx \n\t" \ + "addq %[N], %%rcx \n\t" \ + addC_##m##_##n(%%rax) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(%%zmm, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(%%zmm) \ + 
storeC_##mRegs##_##n##_##edge(vpmovusdb, %%zmm, %[u8C], 0x10, 0x20) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, %%zmm, %[C], 0x40, 0x80) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "b" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_32_asm(m, n, nRegs, mRegs, edge) \ + __asm__ __volatile__( \ + "prefetcht0 0xC0(%[B]) \n\t" \ + "prefetcht0 0x100(%[B]) \n\t" \ + "vmovups (%[B]), %%zmm24 \n\t" \ + "vmovups 0x40(%[B]), %%zmm25 \n\t" \ + "add $0x80, %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 0x40(%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "prefetcht0 0x40(%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_32(%[A], %[K]) \ + "add $0x100, %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_32(%[resK], %%rcx) \ + ".align 16 \n\t" \ + "4: \n\t" \ + addC_##m##_##n(%[C]) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(%%zmm, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(%%zmm) \ + storeC_##mRegs##_##n##_##edge(vpmovusdb, %%zmm, %[u8C], 0x10, 0x20) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, %%zmm, %[C], 0x40, 0x80) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "b" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + 
"%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_16_8_asm(m, n, nRegs, mRegs, rtype, off0, off1, edge) \ + __asm__ __volatile__( \ + "vmovups (%[B]), "#rtype"24 \n\t" \ + "add $"#off0", %[B] \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + loadOffset_##m##_##n(rtype) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##nRegs##Regs(rtype) \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[C], %%rax \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "add %[N], %%rax \n\t" \ + "prefetcht0 (%%rax) \n\t" \ + "prefetcht0 (%%rax, %[N]) \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "shr $3, %%rcx \n\t" \ + "je 3f \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + mmm_##m##_16(%[A], %[K], rtype, off0) \ + "add $"#off1", %[B] \n\t" \ + "add $0x8, %[A] \n\t" \ + "dec %%rcx \n\t" \ + "jg 2b \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[bk], %%rcx \n\t" \ + "and $7, %%rcx \n\t" \ + "je 4f \n\t" \ + "movq $8, %%rcx \n\t" \ + mmm_##m##_16(%[resK], %%rcx, rtype, off0) \ + ".align 16 \n\t" \ + "4: \n\t" \ + addC_##m##_##n(rtype, %[C]) \ + "cmpq $0x0, %[s] \n\t" \ + "je 5f \n\t" \ + convert##nRegs##I32Regs2Ps(rtype, %[s]) \ + "movq %[flags], %%rax \n\t" \ + "andq $0x2, %%rax \n\t" \ + "je 5f \n\t" \ + convert##nRegs##PsRegs2U8(rtype) \ + storeC_##mRegs##_##n##_##edge(vpmovusdb, rtype, %[u8C], 0x0, 0x0) \ + "jmp 6f \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + storeC_##mRegs##_##n##_##edge(vmovups, rtype, %[C], 0x0, 0x0) \ + ".align 16 \n\t" \ + "6: \n\t" \ + : [B] "+r" (matrixB) \ + : [A] "r" (matrixA), \ + [C] "r" (matrixC), \ + [bk] "r" ((int64_t)bk), \ + [N]"r" ((int64_t)(N * 4)), \ + [s] "r" (scale), \ + [K] "r" ((int64_t)stepK), \ + [offset] "r" (offsetC), \ + [flags] "r" ((int64_t)flags), \ + [u8C] "r" (u8Result), \ + [nmask] "r" (nmask), \ + [resK] "r" (resK) \ + : "%rax", "%rbx", "%rcx", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); + +#define mmm_m_16_asm(m, n, nRegs, mRegs, edge) \ + mmm_m_16_8_asm(m, n, nRegs, mRegs, %%zmm, 0x40, 0x80, edge) + +#define mmm_m_8_asm(m, n, nRegs, mRegs, edge) \ + mmm_m_16_8_asm(m, n, nRegs, mRegs, %%ymm, 0x20, 0x40, edge) + +#define mmm_m_n_asm(m, n, nRegs, mRegs, regs) \ + void mmm_avx512_##mRegs##x##n##_asm(U32 um, \ + U32 un, \ + U32 bk, \ + UINT8 *matrixA, \ + INT8 *matrixB, \ + I32 *matrixC, \ + UINT8 *u8Result, \ + I32 *offsetC, \ + U32 N, \ + U32 stepK, \ + const F32 *scale, \ + U32 nmask, \ + UINT8 *resK, \ + U32 flags) \ + { \ + if (nmask == 0) { \ + mmm_m_##n##_asm(m, nRegs, regs, mRegs, 0) \ + } else { \ + mmm_m_##n##_asm(m, nRegs, regs, mRegs, 1) \ + } \ + } + +mmm_m_n_asm(8, 48, 3, 8, 24) +mmm_m_n_asm(8, 48, 3, 7, 24) +mmm_m_n_asm(8, 48, 3, 6, 24) +mmm_m_n_asm(8, 48, 3, 5, 24) +mmm_m_n_asm(4, 48, 3, 4, 12) +mmm_m_n_asm(4, 48, 3, 3, 12) +mmm_m_n_asm(2, 48, 3, 2, 6) +mmm_m_n_asm(1, 48, 3, 1, 1) + +mmm_m_n_asm(12, 32, 2, 12, 24) +mmm_m_n_asm(12, 32, 2, 11, 24) +mmm_m_n_asm(12, 32, 2, 10, 24) +mmm_m_n_asm(12, 32, 2, 9, 
24) +mmm_m_n_asm(12, 32, 2, 8, 24) +mmm_m_n_asm(12, 32, 2, 7, 24) +mmm_m_n_asm(6, 32, 2, 6, 12) +mmm_m_n_asm(6, 32, 2, 5, 12) +mmm_m_n_asm(6, 32, 2, 4, 12) +mmm_m_n_asm(3, 32, 2, 3, 6) +mmm_m_n_asm(3, 32, 2, 2, 6) +mmm_m_n_asm(1, 32, 2, 1, 1) + +mmm_m_n_asm(24, 16, 1, 24, 24) +mmm_m_n_asm(24, 16, 1, 23, 24) +mmm_m_n_asm(24, 16, 1, 22, 24) +mmm_m_n_asm(24, 16, 1, 21, 24) +mmm_m_n_asm(24, 16, 1, 20, 24) +mmm_m_n_asm(24, 16, 1, 19, 24) +mmm_m_n_asm(24, 16, 1, 18, 24) +mmm_m_n_asm(24, 16, 1, 17, 24) +mmm_m_n_asm(24, 16, 1, 16, 24) +mmm_m_n_asm(24, 16, 1, 15, 24) +mmm_m_n_asm(24, 16, 1, 14, 24) +mmm_m_n_asm(24, 16, 1, 13, 24) +mmm_m_n_asm(12, 16, 1, 12, 12) +mmm_m_n_asm(12, 16, 1, 11, 12) +mmm_m_n_asm(12, 16, 1, 10, 12) +mmm_m_n_asm(12, 16, 1, 9, 12) +mmm_m_n_asm(12, 16, 1, 8, 12) +mmm_m_n_asm(12, 16, 1, 7, 12) +mmm_m_n_asm(6, 16, 1, 6, 6) +mmm_m_n_asm(6, 16, 1, 5, 6) +mmm_m_n_asm(6, 16, 1, 4, 6) +mmm_m_n_asm(6, 16, 1, 3, 6) +mmm_m_n_asm(6, 16, 1, 2, 6) +mmm_m_n_asm(1, 16, 1, 1, 1) + +mmm_m_n_asm(24, 8, 1, 24, 24) +mmm_m_n_asm(24, 8, 1, 23, 24) +mmm_m_n_asm(24, 8, 1, 22, 24) +mmm_m_n_asm(24, 8, 1, 21, 24) +mmm_m_n_asm(24, 8, 1, 20, 24) +mmm_m_n_asm(24, 8, 1, 19, 24) +mmm_m_n_asm(24, 8, 1, 18, 24) +mmm_m_n_asm(24, 8, 1, 17, 24) +mmm_m_n_asm(24, 8, 1, 16, 24) +mmm_m_n_asm(24, 8, 1, 15, 24) +mmm_m_n_asm(24, 8, 1, 14, 24) +mmm_m_n_asm(24, 8, 1, 13, 24) +mmm_m_n_asm(12, 8, 1, 12, 12) +mmm_m_n_asm(12, 8, 1, 11, 12) +mmm_m_n_asm(12, 8, 1, 10, 12) +mmm_m_n_asm(12, 8, 1, 9, 12) +mmm_m_n_asm(12, 8, 1, 8, 12) +mmm_m_n_asm(12, 8, 1, 7, 12) +mmm_m_n_asm(6, 8, 1, 6, 6) +mmm_m_n_asm(6, 8, 1, 5, 6) +mmm_m_n_asm(6, 8, 1, 4, 6) +mmm_m_n_asm(6, 8, 1, 3, 6) +mmm_m_n_asm(6, 8, 1, 2, 6) +mmm_m_n_asm(1, 8, 1, 1, 1) + + void matrix_matrix_multiply_tmp_bytes_int8( - U32 row1, U32 col1, U32 row2, U32 col2, DataType dt, U32 *bytes) + U32 row1, U32 col1, U32 row2, U32 col2, DataFormat df, DataType dt, U32 *bytes) { - row1 = align_size(row1, SIMDW); - row2 = align_size(row2, SIMDW); - col1 = align_size(col1, SIMDW); - col2 = align_size(col2, SIMDW); - *bytes = row1 * col1 + row2 * col2 + UNI_MAX(row2, col2) * 4; - *bytes *= sizeof(dt); + U32 alignedN = UNI_ALIGN(col2, 16); + U32 alignedK = UNI_ALIGN(row2, 8); + *bytes = 2 * alignedN * bytesOf(DT_I32) + alignedN * alignedK; + if (df == DF_NORMAL) { + *bytes += 32 * col1; + if (col1 % 8 != 0) { + *bytes += UNI_ALIGN(row1, 24) * 8; + } + } else if (df == DF_TRANSPOSE) { + *bytes += UNI_ALIGN(col1, 24) * UNI_MIN(BOLCK_K_DIM, alignedK); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } *bytes += 64; } -EE matrix_matrix_multiply_transform_rhsN_int8( - TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias) +// clang-format on + +EE matrix_matrix_multiply_transform_rhsN_int8(TensorDesc desc, INT8 *src, INT8 *packB) { DataType dt; DataFormat df; @@ -53,25 +2045,15 @@ EE matrix_matrix_multiply_transform_rhsN_int8( CHECK_STATUS(tensor2dGet(desc, &dt, &df, &K, &N)); U32 unrollSize[4] = {8, 16, 32, 48}; INT8 *tmpS = src; - bool hasBias = (offsetCBias != nullptr); - I32 *sumB = nullptr; - if (!hasBias) { - sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); - } else { - sumB = offsetCBias; - } - packB += N * bytesOf(DT_I32); + I32 *offsetCBias = (I32 *)(packB + UNI_ALIGN(K, SIMDW) * UNI_ALIGN(N, 16)); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = 
UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 4], unrollSizeN); - matrix2_trans_l(unrollSizeN, blockSizeK, N, SIMDW, tmpS + un, packB); - packB += unrollSizeN * alignedBlockSizeK; + U32 alignedN = (unrollSizeN > 8) ? UNI_ALIGN(unrollSizeN, 16) : 8; + matrix2_trans_l(unrollSizeN, alignedN, blockSizeK, N, SIMDW, tmpS + un, packB); + packB += alignedN * UNI_ALIGN(blockSizeK, SIMDW); } tmpS += blockSizeK * N; } @@ -81,13 +2063,12 @@ EE matrix_matrix_multiply_transform_rhsN_int8( for (U32 k = 0; k < K; ++k) { tmp += (I32)(src[k * N + n]); } - sumB[n] += tmp * (-128); + offsetCBias[n] = tmp * (-128); } return SUCCESS; } -EE matrix_matrix_multiply_transform_rhsT_int8( - TensorDesc desc, INT8 *src, INT8 *packB, I32 *offsetCBias) +EE matrix_matrix_multiply_transform_rhsT_int8(TensorDesc desc, INT8 *src, INT8 *packB) { DataType dt; DataFormat df; @@ -95,25 +2076,15 @@ EE matrix_matrix_multiply_transform_rhsT_int8( CHECK_STATUS(tensor2dGet(desc, &dt, &df, &N, &K)); U32 unrollSize[4] = {8, 16, 32, 48}; INT8 *tmpS = src; - bool hasBias = (offsetCBias != nullptr); - I32 *sumB = nullptr; - if (!hasBias) { - sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); - } else { - sumB = offsetCBias; - } - packB += N * bytesOf(DT_I32); + I32 *offsetCBias = (I32 *)(packB + UNI_ALIGN(K, SIMDW) * UNI_ALIGN(N, 16)); for (U32 bk = 0; bk < K; bk += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - bk); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); - unrollSizeN = UNI_MIN(unrollSize[unrollSizeN >> 4], unrollSizeN); - matrix1_trans_l(unrollSizeN, blockSizeK, K, SIMDW, tmpS + un * K, packB); - packB += unrollSizeN * alignedBlockSizeK; + U32 alignedN = (unrollSizeN > 8) ? 
UNI_ALIGN(unrollSizeN, 16) : 8; + matrix1_trans_l(unrollSizeN, alignedN, blockSizeK, K, SIMDW, tmpS + un * K, packB); + packB += alignedN * UNI_ALIGN(blockSizeK, SIMDW); } tmpS += blockSizeK; } @@ -123,4707 +2094,12 @@ EE matrix_matrix_multiply_transform_rhsT_int8( for (U32 k = 0; k < K; ++k) { tmp += (I32)(src[n * K + k]); } - sumB[n] += tmp * (-128); + offsetCBias[n] = tmp * (-128); } return SUCCESS; } -#ifdef _USE_AVX512_VNNI -#define mmmKernel8x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm14 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm17 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm14 \n\t" \ - 
"vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm17 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm23 \n\t" -#else -#define mmmKernel8x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm27, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm28, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm29, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm27, 
%%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm28, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm29, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm18, %%zmm27, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm28, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm29, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm21, %%zmm27, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm28, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm29, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm27, %%zmm12 
\n\t" \ - "vpaddd %%zmm13, %%zmm28, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm29, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm27, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm28, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm29, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm18, %%zmm27, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm28, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm29, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm21, %%zmm27, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm28, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm29, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_8x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups 0x80(%7), %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm1, %%zmm4 \n\t" - "vmovups %%zmm2, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm2, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm1, %%zmm10 \n\t" - "vmovups %%zmm2, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm1, %%zmm13 \n\t" - "vmovups %%zmm2, %%zmm14 \n\t" - "vmovups %%zmm0, %%zmm15 \n\t" - "vmovups %%zmm1, %%zmm16 \n\t" - "vmovups %%zmm2, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm1, %%zmm19 \n\t" - "vmovups %%zmm2, %%zmm20 \n\t" - "vmovups %%zmm0, %%zmm21 \n\t" - "vmovups %%zmm1, %%zmm22 \n\t" - "vmovups %%zmm2, %%zmm23 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps 
%%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" - "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel8x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x80(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x80(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd (%%rax, %4), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm17, %%zmm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x40(%%rax), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x80(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd (%%rax, %4), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps %%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - 
"vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "vpmovusdb %%zmm4, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm5, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - 
"vpmovusdb %%zmm7, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm8, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "vpmovusdb %%zmm10, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm14, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "vpmovusdb %%zmm16, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm17, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm20, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "vpmovusdb %%zmm22, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm23, 0x20(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, 0x40(%%rax) \n\t" - "vmovups %%zmm8, 0x80(%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "vmovups %%zmm10, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm11, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, 0x40(%%rax) \n\t" - "vmovups %%zmm14, 0x80(%%rax) \n\t" - "vmovups %%zmm15, (%%rax, %4) \n\t" - "vmovups %%zmm16, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm17, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm18, (%%rax) \n\t" - "vmovups %%zmm19, 0x40(%%rax) \n\t" - "vmovups %%zmm20, 0x80(%%rax) \n\t" - "vmovups %%zmm21, (%%rax, %4) \n\t" - "vmovups %%zmm22, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm23, 0x80(%%rax, %4) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm12 \n\t" \ - 
"vpdpbusd %%zmm25, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm15 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm17 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm15 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm17 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm23 \n\t" -#else -#define mmmKernel12x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, 
%%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm26, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm27, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm28, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm26, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm27, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm28, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm18, %%zmm26, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm27, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm28, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm21, %%zmm26, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm27, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm28, %%zmm23 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 
\n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm12, %%zmm26, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm27, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm28, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpaddd %%zmm15, %%zmm26, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm27, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm28, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpaddd %%zmm18, %%zmm26, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm27, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm28, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - 
"vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm21, %%zmm26, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm27, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm28, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_12x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm1, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm1, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm1, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm1, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm1, %%zmm13 \n\t" - "vmovups %%zmm0, %%zmm14 \n\t" - "vmovups %%zmm1, %%zmm15 \n\t" - "vmovups %%zmm0, %%zmm16 \n\t" - "vmovups %%zmm1, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm1, %%zmm19 \n\t" - "vmovups %%zmm0, %%zmm20 \n\t" - "vmovups %%zmm1, %%zmm21 \n\t" - "vmovups %%zmm0, %%zmm22 \n\t" - "vmovups %%zmm1, %%zmm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps %%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" - "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax 
\n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %4), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd (%%rax, %4), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x40(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd (%%rax, %4), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm15, %%zmm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x40(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd (%%rax, %4), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm19, %%zmm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x40(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %4), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps %%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - "vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %8, 
%%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, (%%rax, %4) \n\t" - "vpmovusdb %%zmm3, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm6, (%%rax, %4) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm10, (%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm14, (%%rax, %4) \n\t" - "vpmovusdb %%zmm15, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm18, (%%rax, %4) \n\t" - "vpmovusdb %%zmm19, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm22, (%%rax, %4) \n\t" - "vpmovusdb %%zmm23, 0x10(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, (%%rax, %4) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, 0x40(%%rax) \n\t" - "vmovups %%zmm6, (%%rax, %4) \n\t" - "vmovups %%zmm7, 0x40(%%rax, %4) \n\t" - 
"addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, 0x40(%%rax) \n\t" - "vmovups %%zmm10, (%%rax, %4) \n\t" - "vmovups %%zmm11, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, 0x40(%%rax) \n\t" - "vmovups %%zmm14, (%%rax, %4) \n\t" - "vmovups %%zmm15, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm16, (%%rax) \n\t" - "vmovups %%zmm17, 0x40(%%rax) \n\t" - "vmovups %%zmm18, (%%rax, %4) \n\t" - "vmovups %%zmm19, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm20, (%%rax) \n\t" - "vmovups %%zmm21, 0x40(%%rax) \n\t" - "vmovups %%zmm22, (%%rax, %4) \n\t" - "vmovups %%zmm23, 0x40(%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel24x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm12 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm13 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm14 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm15 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm16 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm18 
\n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm19 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm20 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm21 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm22 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm12 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm13 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm14 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm15 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm16 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm18 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm19 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm20 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm21 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm22 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm23 \n\t" -#else -#define mmmKernel24x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, 
%%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm12, %%zmm28, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm29, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm30, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm15, %%zmm28, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm29, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm30, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm18, %%zmm28, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm29, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm30, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 
\n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vpaddd %%zmm21, %%zmm28, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm29, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm30, %%zmm23 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm12, %%zmm28, %%zmm12 \n\t" \ - "vpaddd %%zmm13, %%zmm29, %%zmm13 \n\t" \ - "vpaddd %%zmm14, %%zmm30, %%zmm14 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd 
%%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm15, %%zmm28, %%zmm15 \n\t" \ - "vpaddd %%zmm16, %%zmm29, %%zmm16 \n\t" \ - "vpaddd %%zmm17, %%zmm30, %%zmm17 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm18, %%zmm28, %%zmm18 \n\t" \ - "vpaddd %%zmm19, %%zmm29, %%zmm19 \n\t" \ - "vpaddd %%zmm20, %%zmm30, %%zmm20 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm21, %%zmm28, %%zmm21 \n\t" \ - "vpaddd %%zmm22, %%zmm29, %%zmm22 \n\t" \ - "vpaddd %%zmm23, %%zmm30, %%zmm23 \n\t" -#endif - -inline void mmm_avx512_24x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups %%zmm0, %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm0, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm0, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm0, %%zmm11 \n\t" - "vmovups %%zmm0, %%zmm12 \n\t" - "vmovups %%zmm0, %%zmm13 \n\t" - "vmovups %%zmm0, %%zmm14 \n\t" - "vmovups %%zmm0, %%zmm15 \n\t" - "vmovups %%zmm0, %%zmm16 \n\t" - "vmovups %%zmm0, %%zmm17 \n\t" - "vmovups %%zmm0, %%zmm18 \n\t" - "vmovups %%zmm0, %%zmm19 \n\t" - "vmovups %%zmm0, %%zmm20 \n\t" - "vmovups %%zmm0, %%zmm21 \n\t" - "vmovups %%zmm0, %%zmm22 \n\t" - "vmovups %%zmm0, %%zmm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - "vxorps %%zmm12, %%zmm12, %%zmm12 \n\t" - "vxorps %%zmm13, %%zmm13, %%zmm13 \n\t" - "vxorps %%zmm14, %%zmm14, %%zmm14 \n\t" - "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" - "vxorps %%zmm16, %%zmm16, %%zmm16 \n\t" - "vxorps %%zmm17, %%zmm17, %%zmm17 \n\t" - "vxorps %%zmm18, %%zmm18, %%zmm18 \n\t" 
- "vxorps %%zmm19, %%zmm19, %%zmm19 \n\t" - "vxorps %%zmm20, %%zmm20, %%zmm20 \n\t" - "vxorps %%zmm21, %%zmm21, %%zmm21 \n\t" - "vxorps %%zmm22, %%zmm22, %%zmm22 \n\t" - "vxorps %%zmm23, %%zmm23, %%zmm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel24x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %4), %%zmm1, %%zmm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd (%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd (%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %4), %%zmm11, %%zmm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd (%%rax, %4), %%zmm13, %%zmm13 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd (%%rax, %4), %%zmm15, %%zmm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd (%%rax, %4), %%zmm17, %%zmm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd (%%rax, %4), %%zmm19, %%zmm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd (%%rax, %4), %%zmm21, %%zmm21 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %4), %%zmm23, %%zmm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vcvtdq2ps %%zmm12, %%zmm12 \n\t" - "vcvtdq2ps %%zmm13, %%zmm13 \n\t" - "vcvtdq2ps %%zmm14, %%zmm14 \n\t" - "vcvtdq2ps %%zmm15, %%zmm15 \n\t" - "vcvtdq2ps %%zmm16, %%zmm16 \n\t" - "vcvtdq2ps 
%%zmm17, %%zmm17 \n\t" - "vcvtdq2ps %%zmm18, %%zmm18 \n\t" - "vcvtdq2ps %%zmm19, %%zmm19 \n\t" - "vcvtdq2ps %%zmm20, %%zmm20 \n\t" - "vcvtdq2ps %%zmm21, %%zmm21 \n\t" - "vcvtdq2ps %%zmm22, %%zmm22 \n\t" - "vcvtdq2ps %%zmm23, %%zmm23 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - "vmulps %%zmm12, %%zmm24, %%zmm12 \n\t" - "vmulps %%zmm13, %%zmm24, %%zmm13 \n\t" - "vmulps %%zmm14, %%zmm24, %%zmm14 \n\t" - "vmulps %%zmm15, %%zmm24, %%zmm15 \n\t" - "vmulps %%zmm16, %%zmm24, %%zmm16 \n\t" - "vmulps %%zmm17, %%zmm24, %%zmm17 \n\t" - "vmulps %%zmm18, %%zmm24, %%zmm18 \n\t" - "vmulps %%zmm19, %%zmm24, %%zmm19 \n\t" - "vmulps %%zmm20, %%zmm24, %%zmm20 \n\t" - "vmulps %%zmm21, %%zmm24, %%zmm21 \n\t" - "vmulps %%zmm22, %%zmm24, %%zmm22 \n\t" - "vmulps %%zmm23, %%zmm24, %%zmm23 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - 
"vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm14, (%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm22, (%%rax) \n\t" - "vpmovusdb %%zmm23, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm2, (%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm10, (%%rax) \n\t" - "vmovups %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm12, (%%rax) \n\t" - "vmovups %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm14, (%%rax) \n\t" - "vmovups %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm16, (%%rax) \n\t" - "vmovups %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm18, (%%rax) \n\t" - "vmovups %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm20, (%%rax) \n\t" - "vmovups %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm22, (%%rax) \n\t" - "vmovups %%zmm23, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((int64_t)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel24x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd 
(%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm12 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm13 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm14 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm15 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm16 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm17 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm18 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm19 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm20 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm21 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm22 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm23 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm5 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm11 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm12 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm13 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm14 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm15 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm16 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm17 
\n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm18 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm19 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm20 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm21 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm22 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm23 \n\t" -#else -#define mmmKernel24x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), 
%%ymm27 \n\t" \ - "vpaddd %%ymm12, %%ymm28, %%ymm12 \n\t" \ - "vpaddd %%ymm13, %%ymm29, %%ymm13 \n\t" \ - "vpaddd %%ymm14, %%ymm30, %%ymm14 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm15, %%ymm28, %%ymm15 \n\t" \ - "vpaddd %%ymm16, %%ymm29, %%ymm16 \n\t" \ - "vpaddd %%ymm17, %%ymm30, %%ymm17 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm18, %%ymm28, %%ymm18 \n\t" \ - "vpaddd %%ymm19, %%ymm29, %%ymm19 \n\t" \ - "vpaddd %%ymm20, %%ymm30, %%ymm20 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vmovups (%1), %%ymm24 \n\t" \ - "vpaddd %%ymm21, %%ymm28, %%ymm21 \n\t" \ - "vpaddd %%ymm22, %%ymm29, %%ymm22 \n\t" \ - "vpaddd %%ymm23, %%ymm30, %%ymm23 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, 
%%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm12, %%ymm28, %%ymm12 \n\t" \ - "vpaddd %%ymm13, %%ymm29, %%ymm13 \n\t" \ - "vpaddd %%ymm14, %%ymm30, %%ymm14 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm15, %%ymm28, %%ymm15 \n\t" \ - "vpaddd %%ymm16, %%ymm29, %%ymm16 \n\t" \ - "vpaddd %%ymm17, %%ymm30, %%ymm17 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm18, %%ymm28, %%ymm18 \n\t" \ - "vpaddd %%ymm19, %%ymm29, %%ymm19 \n\t" \ - "vpaddd %%ymm20, %%ymm30, %%ymm20 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm21, %%ymm28, %%ymm21 \n\t" \ - "vpaddd %%ymm22, %%ymm29, %%ymm22 \n\t" \ - "vpaddd %%ymm23, %%ymm30, %%ymm23 \n\t" -#endif - -inline void mmm_avx512_24x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%ymm0 \n\t" - "vmovups %%ymm0, %%ymm1 \n\t" - "vmovups %%ymm0, %%ymm2 \n\t" - "vmovups %%ymm0, %%ymm3 \n\t" - "vmovups %%ymm0, %%ymm4 \n\t" - "vmovups %%ymm0, %%ymm5 
\n\t" - "vmovups %%ymm0, %%ymm6 \n\t" - "vmovups %%ymm0, %%ymm7 \n\t" - "vmovups %%ymm0, %%ymm8 \n\t" - "vmovups %%ymm0, %%ymm9 \n\t" - "vmovups %%ymm0, %%ymm10 \n\t" - "vmovups %%ymm0, %%ymm11 \n\t" - "vmovups %%ymm0, %%ymm12 \n\t" - "vmovups %%ymm0, %%ymm13 \n\t" - "vmovups %%ymm0, %%ymm14 \n\t" - "vmovups %%ymm0, %%ymm15 \n\t" - "vmovups %%ymm0, %%ymm16 \n\t" - "vmovups %%ymm0, %%ymm17 \n\t" - "vmovups %%ymm0, %%ymm18 \n\t" - "vmovups %%ymm0, %%ymm19 \n\t" - "vmovups %%ymm0, %%ymm20 \n\t" - "vmovups %%ymm0, %%ymm21 \n\t" - "vmovups %%ymm0, %%ymm22 \n\t" - "vmovups %%ymm0, %%ymm23 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" - "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" - "vxorps %%ymm14, %%ymm14, %%ymm14 \n\t" - "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" - "vxorps %%ymm16, %%ymm16, %%ymm16 \n\t" - "vxorps %%ymm17, %%ymm17, %%ymm17 \n\t" - "vxorps %%ymm18, %%ymm18, %%ymm18 \n\t" - "vxorps %%ymm19, %%ymm19, %%ymm19 \n\t" - "vxorps %%ymm20, %%ymm20, %%ymm20 \n\t" - "vxorps %%ymm21, %%ymm21, %%ymm21 \n\t" - "vxorps %%ymm22, %%ymm22, %%ymm22 \n\t" - "vxorps %%ymm23, %%ymm23, %%ymm23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel24x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd (%%rax, %4), %%ymm1, %%ymm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd (%%rax, %4), %%ymm3, %%ymm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd (%%rax, %4), %%ymm5, %%ymm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd (%%rax, %4), %%ymm7, %%ymm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd (%%rax, %4), %%ymm9, %%ymm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 
(%%rax, %4), %%ymm11, %%ymm11 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm12, %%ymm12 \n\t" - "vpaddd (%%rax, %4), %%ymm13, %%ymm13 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd (%%rax, %4), %%ymm15, %%ymm15 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd (%%rax, %4), %%ymm17, %%ymm17 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd (%%rax, %4), %%ymm19, %%ymm19 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd (%%rax, %4), %%ymm21, %%ymm21 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd (%%rax, %4), %%ymm23, %%ymm23 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vcvtdq2ps %%ymm1, %%ymm1 \n\t" - "vcvtdq2ps %%ymm2, %%ymm2 \n\t" - "vcvtdq2ps %%ymm3, %%ymm3 \n\t" - "vcvtdq2ps %%ymm4, %%ymm4 \n\t" - "vcvtdq2ps %%ymm5, %%ymm5 \n\t" - "vcvtdq2ps %%ymm6, %%ymm6 \n\t" - "vcvtdq2ps %%ymm7, %%ymm7 \n\t" - "vcvtdq2ps %%ymm8, %%ymm8 \n\t" - "vcvtdq2ps %%ymm9, %%ymm9 \n\t" - "vcvtdq2ps %%ymm10, %%ymm10 \n\t" - "vcvtdq2ps %%ymm11, %%ymm11 \n\t" - "vcvtdq2ps %%ymm12, %%ymm12 \n\t" - "vcvtdq2ps %%ymm13, %%ymm13 \n\t" - "vcvtdq2ps %%ymm14, %%ymm14 \n\t" - "vcvtdq2ps %%ymm15, %%ymm15 \n\t" - "vcvtdq2ps %%ymm16, %%ymm16 \n\t" - "vcvtdq2ps %%ymm17, %%ymm17 \n\t" - "vcvtdq2ps %%ymm18, %%ymm18 \n\t" - "vcvtdq2ps %%ymm19, %%ymm19 \n\t" - "vcvtdq2ps %%ymm20, %%ymm20 \n\t" - "vcvtdq2ps %%ymm21, %%ymm21 \n\t" - "vcvtdq2ps %%ymm22, %%ymm22 \n\t" - "vcvtdq2ps %%ymm23, %%ymm23 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "vmulps %%ymm1, %%ymm24, %%ymm1 \n\t" - "vmulps %%ymm2, %%ymm24, %%ymm2 \n\t" - "vmulps %%ymm3, %%ymm24, %%ymm3 \n\t" - "vmulps %%ymm4, %%ymm24, %%ymm4 \n\t" - "vmulps %%ymm5, %%ymm24, %%ymm5 \n\t" - "vmulps %%ymm6, %%ymm24, %%ymm6 \n\t" - "vmulps %%ymm7, %%ymm24, %%ymm7 \n\t" - "vmulps %%ymm8, %%ymm24, %%ymm8 \n\t" - "vmulps %%ymm9, %%ymm24, %%ymm9 \n\t" - "vmulps %%ymm10, %%ymm24, %%ymm10 \n\t" - "vmulps %%ymm11, %%ymm24, %%ymm11 \n\t" - "vmulps %%ymm12, %%ymm24, %%ymm12 \n\t" - "vmulps %%ymm13, %%ymm24, %%ymm13 \n\t" - "vmulps %%ymm14, %%ymm24, %%ymm14 \n\t" - "vmulps %%ymm15, %%ymm24, %%ymm15 \n\t" - "vmulps %%ymm16, %%ymm24, %%ymm16 \n\t" - "vmulps %%ymm17, %%ymm24, %%ymm17 \n\t" - "vmulps %%ymm18, %%ymm24, %%ymm18 \n\t" - "vmulps %%ymm19, %%ymm24, %%ymm19 \n\t" - "vmulps %%ymm20, %%ymm24, %%ymm20 \n\t" - "vmulps %%ymm21, %%ymm24, %%ymm21 \n\t" - "vmulps %%ymm22, %%ymm24, %%ymm22 \n\t" - "vmulps %%ymm23, %%ymm24, %%ymm23 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "vcvtps2dq %%zmm12, %%zmm12 \n\t" - "vcvtps2dq %%zmm13, %%zmm13 \n\t" - "vcvtps2dq %%zmm14, %%zmm14 \n\t" - "vcvtps2dq %%zmm15, %%zmm15 \n\t" - "vcvtps2dq %%zmm16, %%zmm16 \n\t" - "vcvtps2dq %%zmm17, %%zmm17 \n\t" - "vcvtps2dq %%zmm18, %%zmm18 \n\t" - "vcvtps2dq %%zmm19, %%zmm19 \n\t" - "vcvtps2dq %%zmm20, %%zmm20 \n\t" - "vcvtps2dq %%zmm21, %%zmm21 \n\t" - "vcvtps2dq %%zmm22, %%zmm22 \n\t" - "vcvtps2dq %%zmm23, %%zmm23 \n\t" - "mov 
$128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "vpaddd %%zmm12, %%zmm24, %%zmm12 \n\t" - "vpaddd %%zmm13, %%zmm24, %%zmm13 \n\t" - "vpaddd %%zmm14, %%zmm24, %%zmm14 \n\t" - "vpaddd %%zmm15, %%zmm24, %%zmm15 \n\t" - "vpaddd %%zmm16, %%zmm24, %%zmm16 \n\t" - "vpaddd %%zmm17, %%zmm24, %%zmm17 \n\t" - "vpaddd %%zmm18, %%zmm24, %%zmm18 \n\t" - "vpaddd %%zmm19, %%zmm24, %%zmm19 \n\t" - "vpaddd %%zmm20, %%zmm24, %%zmm20 \n\t" - "vpaddd %%zmm21, %%zmm24, %%zmm21 \n\t" - "vpaddd %%zmm22, %%zmm24, %%zmm22 \n\t" - "vpaddd %%zmm23, %%zmm24, %%zmm23 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm12, (%%rax) \n\t" - "vpmovusdb %%zmm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm14, (%%rax) \n\t" - "vpmovusdb %%zmm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm16, (%%rax) \n\t" - "vpmovusdb %%zmm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm18, (%%rax) \n\t" - "vpmovusdb %%zmm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm20, (%%rax) \n\t" - "vpmovusdb %%zmm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm22, (%%rax) \n\t" - "vpmovusdb %%zmm23, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm2, (%%rax) \n\t" - "vmovups %%ymm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm4, (%%rax) \n\t" - "vmovups %%ymm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm6, (%%rax) \n\t" - "vmovups %%ymm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm8, (%%rax) \n\t" - "vmovups %%ymm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm10, (%%rax) \n\t" - "vmovups %%ymm11, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm12, (%%rax) \n\t" - "vmovups %%ymm13, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm14, (%%rax) \n\t" - "vmovups %%ymm15, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm16, (%%rax) \n\t" - "vmovups %%ymm17, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm18, (%%rax) \n\t" - "vmovups %%ymm19, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm20, (%%rax) \n\t" - "vmovups %%ymm21, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm22, (%%rax) \n\t" - 
"vmovups %%ymm23, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", - "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", - "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", - "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel4x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm2 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm5 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm31 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd %%zmm28, %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd %%zmm29, %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm11 \n\t" -#else -#define mmmKernel4x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 
\n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %6, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm27, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm28, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm29, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm30 \n\t" \ - "vpaddd %%zmm6, %%zmm27, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm28, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm29, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm9, %%zmm27, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm28, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm29, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_4x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const 
F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups 0x80(%7), %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm1, %%zmm4 \n\t" - "vmovups %%zmm2, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm2, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm1, %%zmm10 \n\t" - "vmovups %%zmm2, %%zmm11 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel4x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x80(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x80(%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, 
%%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "vpmovusdb %%zmm4, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm5, 0x20(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm8, 0x20(%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "vpmovusdb %%zmm10, 0x10(%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x20(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %4) \n\t" - "add %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, 0x40(%%rax) \n\t" - "vmovups %%zmm8, 0x80(%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "vmovups %%zmm10, 0x40(%%rax, %4) \n\t" - "vmovups %%zmm11, 0x80(%%rax, %4) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel6x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm24, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm25, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax 
\n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm25, %%zmm29, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm31 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm3 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd %%zmm27, %%zmm30, %%zmm5 \n\t" \ - "vpdpbusd %%zmm26, %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm7 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpdpbusd %%zmm26, %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd %%zmm27, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm26, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm11 \n\t" -#else -#define mmmKernel6x32 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - 
"vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm28, %%zmm2 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %%rbx), %%zmm29 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "addq %6, %%rax \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm3, %%zmm26, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm27, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm28, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpaddd %%zmm6, %%zmm26, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm27, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm28, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm29, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm29, %%zmm28 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm9, %%zmm26, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm27, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm28, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_6x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups 0x40(%7), %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm1, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm1, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm1, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm1, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm1, %%zmm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps 
%%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel6x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %4), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x40(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd (%%rax, %4), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x40(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %4), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x40(%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, 
%%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm2, (%%rax, %4) \n\t" - "vpmovusdb %%zmm3, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm6, (%%rax, %4) \n\t" - "vpmovusdb %%zmm7, 0x10(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, 0x10(%%rax) \n\t" - "vpmovusdb %%zmm10, (%%rax, %4) \n\t" - "vpmovusdb %%zmm11, 0x10(%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, (%%rax, %4) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, 0x40(%%rax) \n\t" - "vmovups %%zmm6, (%%rax, %4) \n\t" - "vmovups %%zmm7, 0x40(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, 0x40(%%rax) \n\t" - "vmovups %%zmm10, (%%rax, %4) \n\t" - "vmovups %%zmm11, 0x40(%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm3 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm5 \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm24, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm24, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm24, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm0 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm1 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm2 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm3 \n\t" \ - 
"vpdpbusd %%zmm31, %%zmm29, %%zmm4 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm5 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm30 \n\t" \ - "vpdpbusd %%zmm31, %%zmm25, %%zmm6 \n\t" \ - "vpdpbusd %%zmm31, %%zmm26, %%zmm7 \n\t" \ - "vpdpbusd %%zmm31, %%zmm27, %%zmm8 \n\t" \ - "vpdpbusd %%zmm31, %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd %%zmm31, %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd %%zmm31, %%zmm30, %%zmm11 \n\t" -#else -#define mmmKernel12x16 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 
\n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm29, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm30, %%zmm2 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm3, %%zmm28, %%zmm3 \n\t" \ - "vpaddd %%zmm4, %%zmm29, %%zmm4 \n\t" \ - "vpaddd %%zmm5, %%zmm30, %%zmm5 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%zmm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%zmm27 \n\t" \ - "vpaddd %%zmm6, %%zmm28, %%zmm6 \n\t" \ - "vpaddd %%zmm7, %%zmm29, %%zmm7 \n\t" \ - "vpaddd %%zmm8, %%zmm30, %%zmm8 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm26, %%zmm29 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm27, %%zmm30 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vpmaddwd %%zmm30, %%zmm31, %%zmm30 \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm9, %%zmm28, %%zmm9 \n\t" \ - "vpaddd %%zmm10, %%zmm29, %%zmm10 \n\t" \ - "vpaddd %%zmm11, %%zmm30, %%zmm11 \n\t" -#endif - -inline void mmm_avx512_12x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%zmm0 \n\t" - "vmovups %%zmm0, %%zmm1 \n\t" - "vmovups %%zmm0, %%zmm2 \n\t" - "vmovups %%zmm0, %%zmm3 \n\t" - "vmovups %%zmm0, %%zmm4 \n\t" - "vmovups %%zmm0, %%zmm5 \n\t" - "vmovups %%zmm0, %%zmm6 \n\t" - "vmovups %%zmm0, %%zmm7 \n\t" - "vmovups %%zmm0, %%zmm8 \n\t" - "vmovups %%zmm0, %%zmm9 \n\t" - "vmovups %%zmm0, %%zmm10 \n\t" - "vmovups %%zmm0, %%zmm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - "vxorps %%zmm4, %%zmm4, %%zmm4 \n\t" - "vxorps %%zmm5, %%zmm5, %%zmm5 \n\t" - "vxorps %%zmm6, %%zmm6, %%zmm6 \n\t" - "vxorps %%zmm7, %%zmm7, %%zmm7 \n\t" - "vxorps %%zmm8, %%zmm8, %%zmm8 \n\t" - "vxorps %%zmm9, %%zmm9, %%zmm9 \n\t" - "vxorps %%zmm10, %%zmm10, %%zmm10 \n\t" - "vxorps %%zmm11, %%zmm11, %%zmm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 
(%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %4), %%zmm1, %%zmm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd (%%rax, %4), %%zmm3, %%zmm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd (%%rax, %4), %%zmm5, %%zmm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd (%%rax, %4), %%zmm7, %%zmm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd (%%rax, %4), %%zmm9, %%zmm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %4), %%zmm11, %%zmm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vcvtdq2ps %%zmm3, %%zmm3 \n\t" - "vcvtdq2ps %%zmm4, %%zmm4 \n\t" - "vcvtdq2ps %%zmm5, %%zmm5 \n\t" - "vcvtdq2ps %%zmm6, %%zmm6 \n\t" - "vcvtdq2ps %%zmm7, %%zmm7 \n\t" - "vcvtdq2ps %%zmm8, %%zmm8 \n\t" - "vcvtdq2ps %%zmm9, %%zmm9 \n\t" - "vcvtdq2ps %%zmm10, %%zmm10 \n\t" - "vcvtdq2ps %%zmm11, %%zmm11 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - "vmulps %%zmm3, %%zmm24, %%zmm3 \n\t" - "vmulps %%zmm4, %%zmm24, %%zmm4 \n\t" - "vmulps %%zmm5, %%zmm24, %%zmm5 \n\t" - "vmulps %%zmm6, %%zmm24, %%zmm6 \n\t" - "vmulps %%zmm7, %%zmm24, %%zmm7 \n\t" - "vmulps %%zmm8, %%zmm24, %%zmm8 \n\t" - "vmulps %%zmm9, %%zmm24, %%zmm9 \n\t" - "vmulps %%zmm10, %%zmm24, %%zmm10 \n\t" - "vmulps %%zmm11, %%zmm24, %%zmm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, 
(%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm2, (%%rax) \n\t" - "vmovups %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm4, (%%rax) \n\t" - "vmovups %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm6, (%%rax) \n\t" - "vmovups %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm8, (%%rax) \n\t" - "vmovups %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%zmm10, (%%rax) \n\t" - "vmovups %%zmm11, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((int64_t)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", - "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", - "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", - "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel12x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm5 \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm24, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm24, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm24, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm24, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm24, %%ymm30, %%ymm11 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm0 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm1 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm2 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm3 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm4 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm5 \n\t" \ - 
"vmovups 0x20(%1), %%ymm24 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm28 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm29 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm30 \n\t" \ - "vpdpbusd %%ymm31, %%ymm25, %%ymm6 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm7 \n\t" \ - "vpdpbusd %%ymm31, %%ymm27, %%ymm8 \n\t" \ - "vpdpbusd %%ymm31, %%ymm28, %%ymm9 \n\t" \ - "vpdpbusd %%ymm31, %%ymm29, %%ymm10 \n\t" \ - "vpdpbusd %%ymm31, %%ymm30, %%ymm11 \n\t" -#else -#define mmmKernel12x8 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "movq %0, %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" \ - "vmovups (%1), %%ymm24 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - 
"vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm1, %%ymm29, %%ymm1 \n\t" \ - "vpaddd %%ymm2, %%ymm30, %%ymm2 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm3, %%ymm28, %%ymm3 \n\t" \ - "vpaddd %%ymm4, %%ymm29, %%ymm4 \n\t" \ - "vpaddd %%ymm5, %%ymm30, %%ymm5 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "addq %%rbx, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%ymm25 \n\t" \ - "vpbroadcastd (%%rax, %6), %%ymm26 \n\t" \ - "vpbroadcastd (%%rax, %6, 2), %%ymm27 \n\t" \ - "vpaddd %%ymm6, %%ymm28, %%ymm6 \n\t" \ - "vpaddd %%ymm7, %%ymm29, %%ymm7 \n\t" \ - "vpaddd %%ymm8, %%ymm30, %%ymm8 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm26, %%ymm29 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm27, %%ymm30 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vpmaddwd %%ymm30, %%ymm31, %%ymm30 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm9, %%ymm28, %%ymm9 \n\t" \ - "vpaddd %%ymm10, %%ymm29, %%ymm10 \n\t" \ - "vpaddd %%ymm11, %%ymm30, %%ymm11 \n\t" -#endif - -inline void mmm_avx512_12x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%ebx \n\t" - "vmovd %%ebx, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %8, %%rbx \n\t" - "andq $0x1, %%rbx \n\t" - "jne 0f \n\t" - "vmovups (%7), %%ymm0 \n\t" - "vmovups %%ymm0, %%ymm1 \n\t" - "vmovups %%ymm0, %%ymm2 \n\t" - "vmovups %%ymm0, %%ymm3 \n\t" - "vmovups %%ymm0, %%ymm4 \n\t" - "vmovups %%ymm0, %%ymm5 \n\t" - "vmovups %%ymm0, %%ymm6 \n\t" - "vmovups %%ymm0, %%ymm7 \n\t" - "vmovups %%ymm0, %%ymm8 \n\t" - "vmovups %%ymm0, %%ymm9 \n\t" - "vmovups %%ymm0, %%ymm10 \n\t" - "vmovups %%ymm0, %%ymm11 \n\t" - "jmp 1f \n\t" - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" - "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" - "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" - "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" - "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" - "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" - "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - 
"prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "movq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - "addq %6, %%rbx \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel12x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "movq %2, %%rax \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd (%%rax, %4), %%ymm1, %%ymm1 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd (%%rax, %4), %%ymm3, %%ymm3 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd (%%rax, %4), %%ymm5, %%ymm5 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd (%%rax, %4), %%ymm7, %%ymm7 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd (%%rax, %4), %%ymm9, %%ymm9 \n\t" - "addq %%rcx, %%rax \n\t" - "vpaddd (%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd (%%rax, %4), %%ymm11, %%ymm11 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vcvtdq2ps %%ymm1, %%ymm1 \n\t" - "vcvtdq2ps %%ymm2, %%ymm2 \n\t" - "vcvtdq2ps %%ymm3, %%ymm3 \n\t" - "vcvtdq2ps %%ymm4, %%ymm4 \n\t" - "vcvtdq2ps %%ymm5, %%ymm5 \n\t" - "vcvtdq2ps %%ymm6, %%ymm6 \n\t" - "vcvtdq2ps %%ymm7, %%ymm7 \n\t" - "vcvtdq2ps %%ymm8, %%ymm8 \n\t" - "vcvtdq2ps %%ymm9, %%ymm9 \n\t" - "vcvtdq2ps %%ymm10, %%ymm10 \n\t" - "vcvtdq2ps %%ymm11, %%ymm11 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "vmulps %%ymm1, %%ymm24, %%ymm1 \n\t" - "vmulps %%ymm2, %%ymm24, %%ymm2 \n\t" - "vmulps %%ymm3, %%ymm24, %%ymm3 \n\t" - "vmulps %%ymm4, %%ymm24, %%ymm4 \n\t" - "vmulps %%ymm5, %%ymm24, %%ymm5 \n\t" - "vmulps %%ymm6, %%ymm24, %%ymm6 \n\t" - "vmulps %%ymm7, %%ymm24, %%ymm7 \n\t" - "vmulps %%ymm8, %%ymm24, %%ymm8 \n\t" - "vmulps %%ymm9, %%ymm24, %%ymm9 \n\t" - "vmulps %%ymm10, %%ymm24, %%ymm10 \n\t" - "vmulps %%ymm11, %%ymm24, %%ymm11 \n\t" - - "movq %8, %%rbx \n\t" - "andq $0x2, %%rbx \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "vcvtps2dq %%zmm3, %%zmm3 \n\t" - "vcvtps2dq %%zmm4, %%zmm4 \n\t" - "vcvtps2dq %%zmm5, %%zmm5 \n\t" - "vcvtps2dq %%zmm6, %%zmm6 \n\t" - "vcvtps2dq %%zmm7, %%zmm7 \n\t" - "vcvtps2dq %%zmm8, %%zmm8 \n\t" - "vcvtps2dq %%zmm9, %%zmm9 \n\t" - "vcvtps2dq %%zmm10, %%zmm10 \n\t" - "vcvtps2dq %%zmm11, %%zmm11 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpaddd %%zmm3, %%zmm24, %%zmm3 \n\t" - "vpaddd %%zmm4, %%zmm24, %%zmm4 \n\t" - "vpaddd %%zmm5, %%zmm24, %%zmm5 \n\t" - "vpaddd %%zmm6, %%zmm24, %%zmm6 \n\t" - "vpaddd %%zmm7, %%zmm24, %%zmm7 \n\t" - "vpaddd %%zmm8, %%zmm24, %%zmm8 \n\t" - "vpaddd %%zmm9, %%zmm24, %%zmm9 \n\t" - "vpaddd %%zmm10, %%zmm24, %%zmm10 \n\t" - "vpaddd %%zmm11, %%zmm24, %%zmm11 \n\t" - "movq %9, %%rax \n\t" - "shr $2, %4 \n\t" - "movq %4, %%rcx \n\t" - "addq %4, %%rcx \n\t" - "vpmovusdb %%zmm0, (%%rax) \n\t" - "vpmovusdb %%zmm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm2, (%%rax) \n\t" - "vpmovusdb %%zmm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm4, (%%rax) \n\t" - "vpmovusdb %%zmm5, (%%rax, %4) \n\t" - "addq %%rcx, 
%%rax \n\t" - "vpmovusdb %%zmm6, (%%rax) \n\t" - "vpmovusdb %%zmm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm8, (%%rax) \n\t" - "vpmovusdb %%zmm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vpmovusdb %%zmm10, (%%rax) \n\t" - "vpmovusdb %%zmm11, (%%rax, %4) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "movq %2, %%rax \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm2, (%%rax) \n\t" - "vmovups %%ymm3, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm4, (%%rax) \n\t" - "vmovups %%ymm5, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm6, (%%rax) \n\t" - "vmovups %%ymm7, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm8, (%%rax) \n\t" - "vmovups %%ymm9, (%%rax, %4) \n\t" - "addq %%rcx, %%rax \n\t" - "vmovups %%ymm10, (%%rax) \n\t" - "vmovups %%ymm11, (%%rax, %4) \n\t" - - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"((int64_t)stepK), "r"(offsetC), "r"((int64_t)flags), "r"(u8Result) - : "%rax", "%rbx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", - "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", - "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", - "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "vpbroadcastd 0x4(%%rax), %%zmm31 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups (%1), %%zmm27 \n\t" \ - "vmovups 0x40(%1), %%zmm28 \n\t" \ - "vmovups 0x80(%1), %%zmm29 \n\t" \ - "vpdpbusd %%zmm24, %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd %%zmm26, %%zmm30, %%zmm2 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 0x140(%1), %%zmm26 \n\t" \ - "vpdpbusd %%zmm27, %%zmm31, %%zmm0 \n\t" \ - "vpdpbusd %%zmm28, %%zmm31, %%zmm1 \n\t" \ - "vpdpbusd %%zmm29, %%zmm31, %%zmm2 \n\t" -#else -#define mmmKernel1x48 \ - "movq %0, %%rax \n\t" \ - "vpbroadcastd (%%rax), %%zmm30 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vmovups 0x80(%1), %%zmm26 \n\t" \ - "vpbroadcastd 0x4(%%rax), %%zmm30 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" \ - "prefetcht0 0x180(%1) \n\t" \ - "prefetcht0 0x1C0(%1) \n\t" \ - "prefetcht0 0x200(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm27 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm26, %%zmm30, %%zmm29 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "vmovups 0xC0(%1), %%zmm24 \n\t" \ - "vmovups 0x100(%1), %%zmm25 \n\t" \ - "vmovups 
0x140(%1), %%zmm26 \n\t" \ - "vpaddd %%zmm0, %%zmm27, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm28, %%zmm1 \n\t" \ - "vpaddd %%zmm2, %%zmm29, %%zmm2 \n\t" -#endif - -inline void mmm_avx512_1x48_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0xC0(%1) \n\t" - "prefetcht0 0x100(%1) \n\t" - "prefetcht0 0x140(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "vmovups 0x80(%1), %%zmm26 \n\t" - "add $0xC0, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "vmovups 0x40(%6), %%zmm1 \n\t" - "vmovups 0x80(%6), %%zmm2 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - - ".align 16 \n\t" - "1: \n\t" - "movq %2, %%rax \n\t" - "add %4, %%rax \n\t" - "prefetcht0 (%%rax) \n\t" - "prefetcht0 0x40(%%rax) \n\t" - "prefetcht0 0x80(%%rax) \n\t" - "prefetcht0 (%%rax, %4) \n\t" - "prefetcht0 0x40(%%rax, %4) \n\t" - "prefetcht0 0x80(%%rax, %4) \n\t" - - ".align 16 \n\t" - "2: \n\t" mmmKernel1x48 - - "add $0x180, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 2b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%2), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%2), %%zmm2, %%zmm2 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 3f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vcvtdq2ps %%zmm2, %%zmm2 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - "vmulps %%zmm2, %%zmm24, %%zmm2 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 3f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "vcvtps2dq %%zmm2, %%zmm2 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpaddd %%zmm2, %%zmm24, %%zmm2 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "vpmovusdb %%zmm1, 0x10(%8) \n\t" - "vpmovusdb %%zmm2, 0x20(%8) \n\t" - "jmp 4f \n\t" - - ".align 16 \n\t" - "3: \n\t" - "vmovups %%zmm0, (%2) \n\t" - "vmovups %%zmm1, 0x40(%2) \n\t" - "vmovups %%zmm2, 0x80(%2) \n\t" - ".align 16 \n\t" - "4: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x32 \ - "vpbroadcastd (%0), %%zmm28 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm29 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups (%1), %%zmm26 \n\t" \ - "vmovups 0x40(%1), %%zmm27 \n\t" \ - "vpdpbusd %%zmm24, %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd %%zmm25, %%zmm28, %%zmm1 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - 
"vpdpbusd %%zmm26, %%zmm29, %%zmm0 \n\t" \ - "vpdpbusd %%zmm27, %%zmm29, %%zmm1 \n\t" -#else -#define mmmKernel1x32 \ - "vpbroadcastd (%0), %%zmm30 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm30 \n\t" \ - "vmovups (%1), %%zmm24 \n\t" \ - "vmovups 0x40(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" \ - "prefetcht0 0x100(%1) \n\t" \ - "prefetcht0 0x140(%1) \n\t" \ - "vpmaddubsw %%zmm24, %%zmm30, %%zmm26 \n\t" \ - "vpmaddubsw %%zmm25, %%zmm30, %%zmm27 \n\t" \ - "vpmaddwd %%zmm26, %%zmm31, %%zmm26 \n\t" \ - "vpmaddwd %%zmm27, %%zmm31, %%zmm27 \n\t" \ - "vmovups 0x80(%1), %%zmm24 \n\t" \ - "vmovups 0xC0(%1), %%zmm25 \n\t" \ - "vpaddd %%zmm0, %%zmm26, %%zmm0 \n\t" \ - "vpaddd %%zmm1, %%zmm27, %%zmm1 \n\t" -#endif - -inline void mmm_avx512_1x32_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "prefetcht0 0xC0(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "vmovups 0x40(%1), %%zmm25 \n\t" - "add $0x80, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "vmovups 0x40(%6), %%zmm1 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x32 - - "add $0x100, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%2), %%zmm1, %%zmm1 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vcvtdq2ps %%zmm1, %%zmm1 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - "vmulps %%zmm1, %%zmm24, %%zmm1 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "vcvtps2dq %%zmm1, %%zmm1 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpaddd %%zmm1, %%zmm24, %%zmm1 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "vpmovusdb %%zmm1, 0x10(%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%2) \n\t" - "vmovups %%zmm1, 0x40(%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x16 \ - "vpbroadcastd (%0), %%zmm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%zmm31 \n\t" \ - "vpdpbusd %%zmm24, %%zmm25, %%zmm0 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - 
"vpdpbusd %%zmm31, %%zmm26, %%zmm0 \n\t" -#else -#define mmmKernel1x16 \ - "vpbroadcastd (%0), %%zmm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%zmm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%zmm30 \n\t" \ - "vpmaddubsw %%zmm24, %%zmm25, %%zmm28 \n\t" \ - "vpmaddubsw %%zmm30, %%zmm26, %%zmm29 \n\t" \ - "vpmaddwd %%zmm28, %%zmm31, %%zmm28 \n\t" \ - "vpmaddwd %%zmm29, %%zmm31, %%zmm29 \n\t" \ - "prefetcht0 0xC0(%1) \n\t" \ - "vmovups 0x40(%1), %%zmm24 \n\t" \ - "vpaddd %%zmm0, %%zmm28, %%zmm0 \n\t" \ - "vpaddd %%zmm0, %%zmm29, %%zmm0 \n\t" -#endif - -inline void mmm_avx512_1x16_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x80(%1) \n\t" - "vmovups (%1), %%zmm24 \n\t" - "add $0x40, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, %%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%zmm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%zmm0 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x16 - - "add $0x80, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%zmm0, %%zmm0 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%zmm24 \n\t" - "vcvtdq2ps %%zmm0, %%zmm0 \n\t" - "vmulps %%zmm0, %%zmm24, %%zmm0 \n\t" - - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%zmm0, %%zmm0 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%zmm24 \n\t" - "vpaddd %%zmm0, %%zmm24, %%zmm0 \n\t" - "vpmovusdb %%zmm0, (%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", - "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", - "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", "%zmm24", "%zmm25", "%zmm26", - "%zmm27", "%zmm28", "%zmm29", "%zmm30", "%zmm31", "memory", "cc"); -} - -#ifdef _USE_AVX512_VNNI -#define mmmKernel1x8 \ - "vpbroadcastd (%0), %%ymm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%ymm26 \n\t" \ - "prefetcht0 0x40(%1) \n\t" \ - "vmovups (%1), %%ymm31 \n\t" \ - "vpdpbusd %%ymm24, %%ymm25, %%ymm0 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpdpbusd %%ymm31, %%ymm26, %%ymm0 \n\t" -#else -#define mmmKernel1x8 \ - "vpbroadcastd (%0), %%ymm25 \n\t" \ - "vpbroadcastd 0x4(%0), %%ymm26 \n\t" \ - "prefetcht0 0x80(%1) \n\t" \ - "vmovups (%1), %%ymm30 \n\t" \ - "vpmaddubsw %%ymm24, %%ymm25, %%ymm28 \n\t" \ - "vpmaddubsw %%ymm30, %%ymm26, %%ymm29 \n\t" \ - "vpmaddwd %%ymm28, %%ymm31, %%ymm28 \n\t" \ - "vpmaddwd %%ymm29, %%ymm31, %%ymm29 \n\t" \ - "vmovups 0x20(%1), %%ymm24 \n\t" \ - "vpaddd %%ymm0, %%ymm28, %%ymm0 \n\t" \ - "vpaddd %%ymm0, %%ymm29, %%ymm0 \n\t" -#endif - -inline void mmm_avx512_1x8_asm(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - __asm__ __volatile__( - "prefetcht0 0x40(%1) \n\t" - "vmovups (%1), %%ymm24 \n\t" - "add $0x20, %1 \n\t" -#ifndef _USE_AVX512_VNNI - "mov $1, 
%%eax \n\t" - "vmovd %%eax, %%xmm0 \n\t" - "vpbroadcastw %%xmm0, %%ymm31 \n\t" -#endif - "movq %%rbx, %%rax \n\t" - "andq $0x1, %%rax \n\t" - "jne 0f \n\t" - "vmovups (%6), %%ymm0 \n\t" - "jmp 1f \n\t" - - ".align 16 \n\t" - "0: \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - - ".align 16 \n\t" - "1: \n\t" mmmKernel1x8 - - "add $0x40, %1 \n\t" - "add $0x8, %0 \n\t" - "dec %%rcx \n\t" - "jg 1b \n\t" - - "vpaddd (%2), %%ymm0, %%ymm0 \n\t" - - "cmpq $0x0, %5 \n\t" - "je 2f \n\t" - - "vbroadcastss (%5), %%ymm24 \n\t" - "vcvtdq2ps %%ymm0, %%ymm0 \n\t" - "vmulps %%ymm0, %%ymm24, %%ymm0 \n\t" - "movq %%rbx, %%rax \n\t" - "andq $0x2, %%rax \n\t" - "je 2f \n\t" - "vcvtps2dq %%ymm0, %%ymm0 \n\t" - "mov $128, %%eax \n\t" - "vmovd %%eax, %%xmm25 \n\t" - "vbroadcastss %%xmm25, %%ymm24 \n\t" - "vpaddd %%ymm0, %%ymm24, %%ymm0 \n\t" - "vpmovusdb %%ymm0, (%8) \n\t" - "jmp 3f \n\t" - - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%2) \n\t" - - ".align 16 \n\t" - "3: \n\t" - : - : "r"(matrixA), "r"(matrixB), "r"(matrixC), "c"((int64_t)bk), "r"((long long)(N * 4)), - "r"(scale), "r"(offsetC), "b"((int64_t)flags), "r"(u8Result) - : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", - "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", - "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", "%ymm24", "%ymm25", "%ymm26", - "%ymm27", "%ymm28", "%ymm29", "%ymm30", "%ymm31", "memory", "cc"); -} - -void mmm_avx512_n_mtail(U32 um, - U32 un, - U32 bk, - UINT8 *matrixA, - INT8 *matrixB, - I32 *matrixC, - UINT8 *u8Result, - I32 *offsetC, - U32 N, - U32 stepK, - const F32 *scale, - U32 flags) -{ - I32 *result = (I32 *)matrixC; - F32 *resultF32 = (F32 *)matrixC; - for (U32 i = 0; i < um; ++i) { - for (U32 j = 0; j < un; ++j) { - I32 tmp = result[i * N + j]; - for (U32 k = 0; k < bk * 8; k += 4) { - if (((flags & 0x1) == 0) && (k == 0)) { - tmp += offsetC[j]; - } - for (U32 k4 = 0; k4 < 4; ++k4) { - tmp += (int)matrixA[i * stepK + k4 + k] * (int)matrixB[k * un + j * 4 + k4]; - } - } - if (scale != nullptr) { - resultF32[i * N + j] = tmp * scale[0]; - if ((flags & 0x2) != 0) { - tmp = (I32)(resultF32[i * N + j] + 128); - u8Result[i * N + j] = (tmp > 255) ? 
255 : tmp; - } - } else { - result[i * N + j] = tmp; - } - } - } -} - //TODO: matrixC alloc EE mmm_avx512_vnni_int8(U32 N, U32 M, @@ -4836,38 +2112,50 @@ EE mmm_avx512_vnni_int8(U32 N, const F32 *scale) { UINT8 *packA = matrix1; - kernel_func kernel[3][5] = {{mmm_avx512_n_mtail, mmm_avx512_1x8_asm, mmm_avx512_1x16_asm, - mmm_avx512_1x32_asm, mmm_avx512_1x48_asm}, - {mmm_avx512_n_mtail, mmm_avx512_12x8_asm, mmm_avx512_12x16_asm, mmm_avx512_6x32_asm, - mmm_avx512_4x48_asm}, - {mmm_avx512_n_mtail, mmm_avx512_24x8_asm, mmm_avx512_24x16_asm, mmm_avx512_12x32_asm, - mmm_avx512_8x48_asm}}; - U32 unrollNSizes[5] = {8, 8, 16, 32, 48}; - U32 unrollMSize[5] = {M, 24, 24, 12, 8}; + kernel_func kernel[24][4] = { + {mmm_avx512_1x8_asm, mmm_avx512_1x16_asm, mmm_avx512_1x32_asm, mmm_avx512_1x48_asm}, + {mmm_avx512_2x8_asm, mmm_avx512_2x16_asm, mmm_avx512_2x32_asm, mmm_avx512_2x48_asm}, + {mmm_avx512_3x8_asm, mmm_avx512_3x16_asm, mmm_avx512_3x32_asm, mmm_avx512_3x48_asm}, + {mmm_avx512_4x8_asm, mmm_avx512_4x16_asm, mmm_avx512_4x32_asm, mmm_avx512_4x48_asm}, + {mmm_avx512_5x8_asm, mmm_avx512_5x16_asm, mmm_avx512_5x32_asm, mmm_avx512_5x48_asm}, + {mmm_avx512_6x8_asm, mmm_avx512_6x16_asm, mmm_avx512_6x32_asm, mmm_avx512_6x48_asm}, + {mmm_avx512_7x8_asm, mmm_avx512_7x16_asm, mmm_avx512_7x32_asm, mmm_avx512_7x48_asm}, + {mmm_avx512_8x8_asm, mmm_avx512_8x16_asm, mmm_avx512_8x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_9x8_asm, mmm_avx512_9x16_asm, mmm_avx512_9x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_10x8_asm, mmm_avx512_10x16_asm, mmm_avx512_10x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_11x8_asm, mmm_avx512_11x16_asm, mmm_avx512_11x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_12x8_asm, mmm_avx512_12x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_13x8_asm, mmm_avx512_13x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_14x8_asm, mmm_avx512_14x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_15x8_asm, mmm_avx512_15x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_16x8_asm, mmm_avx512_16x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_17x8_asm, mmm_avx512_17x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_18x8_asm, mmm_avx512_18x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_19x8_asm, mmm_avx512_19x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_20x8_asm, mmm_avx512_20x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_21x8_asm, mmm_avx512_21x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_22x8_asm, mmm_avx512_22x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_23x8_asm, mmm_avx512_23x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}, + {mmm_avx512_24x8_asm, mmm_avx512_24x16_asm, mmm_avx512_12x32_asm, mmm_avx512_8x48_asm}}; + U32 unrollNSizes[4] = {8, 16, 32, 48}; + U32 unrollMSizes[5] = {24, 24, 12, 8}; U32 alignedK = (K + 7) / 8 * 8; I32 *offsetC = (I32 *)(tmp); tmp += N * bytesOf(DT_I32); - UINT8 *tmpA = tmp; - tmp += M * alignedK * bytesOf(DT_U8_Q); - packB += N * bytesOf(DT_I32); - if (uintptr_t(tmp + N * bytesOf(DT_I32)) == uintptr_t(packB)) { // matmul - tmp += N * alignedK * bytesOf(DT_I8) + N * bytesOf(DT_I32); - } - U32 flags = 0; F32 *factorPtr = nullptr; F32 factor = 0; I32 *i32Result = (I32 *)result; UINT8 *u8Result = result; if (scale != nullptr) { - if (scale[0] < - 0) { // when use offline scale, the output datatype is U8_Q, you need more tmp buffer + if (scale[0] < 0) { + // when use offline scale, the 
output datatype is U8_Q, you need more tmp buffer flags |= 1 << 1; factor = scale[1]; i32Result = (I32 *)tmp; - memset(i32Result, 0, M * N * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, M * N * bytesOf(DT_I32)); tmp += M * N * bytesOf(DT_I32); } else { factor = 1 / (scale[0]); @@ -4875,194 +2163,124 @@ EE mmm_avx512_vnni_int8(U32 N, factorPtr = &factor; } - auto computeMNums = [=](U32 block, U32 unit) { - return block / unit + (block % unit >= (unit / 2)) + (block % (unit / 2)); + auto getEdgeMSize = [](U32 resM, U32 unrollM) { + U32 unit = unrollM / 2; + U32 low = unrollM / 4; + return (resM > 1) ? ((resM > low) ? ((resM + unit - 1) / unit * unit) : low) : resM; }; - - U32 mNum = M / BOLCK_M_DIM; - U32 unNum = N / UNROLL_N; - U32 unArrays[4] = {0}; - U32 umArrays[4] = {0}; - U32 umNums[4] = {0}; - U32 umResNums[4] = {0}; - U32 res = N % UNROLL_N; - unArrays[0] = UNROLL_N; - umArrays[0] = unrollMSize[(UNROLL_N >> 4) + 1]; - umNums[0] = computeMNums(BOLCK_M_DIM, umArrays[0]); - U32 idx = 1; - while (res > 0) { - unArrays[idx] = UNI_MIN(unrollNSizes[(res >> 4) + 1], res); - umArrays[idx] = unrollMSize[(res >> 4) + 1]; - umNums[idx] = computeMNums(BOLCK_M_DIM, umArrays[idx]); - if (unArrays[idx] < 8) { - umArrays[idx] = UNI_MIN(unrollMSize[0], BOLCK_M_DIM); - umNums[idx] = 1; - } - res -= unArrays[idx++]; + auto getMNum = [](U32 mDim, U32 unrollM) { return mDim / unrollM + ((mDim % unrollM) > 0); }; + + U32 resN = N % UNROLL_N; + U32 edgeNSize = (resN > 8) ? UNI_ALIGN(resN, 16) : 8; + U32 resM = M % UNROLL_M; + U32 mainEdgeMSize = getEdgeMSize(resM, UNROLL_M); + UINT8 *lastMainBlockA = packA + M / UNROLL_M * UNROLL_M * K; + if (resM < mainEdgeMSize && matrix1Df == DF_NORMAL) { // padding last block + UNI_MEMCPY(tmp, lastMainBlockA, resM * K); + UNI_MEMSET(tmp + resM * K, 128, (mainEdgeMSize - resM) * K); + lastMainBlockA = tmp; + tmp += mainEdgeMSize * K; } - U32 nLoopNum = unNum * umNums[0] + umNums[1] + umNums[2] + umNums[3]; - U32 mLoopNum = nLoopNum * mNum; - U32 nLoopResNum = 0; - if (M % BOLCK_M_DIM > 0) { - res = M % BOLCK_M_DIM; - for (U32 i = 0; i < 4 && umArrays[i] > 0; ++i) { - if (unArrays[i] < 8) { - umResNums[i] = 1; - } else { - umResNums[i] = computeMNums(res, umArrays[i]); - } - } - nLoopResNum = (unNum * umResNums[0] + umResNums[1] + umResNums[2] + umResNums[3]); + U32 mloopNum = getMNum(BOLCK_M_DIM, UNROLL_M) * (M / BOLCK_M_DIM) + + getMNum(M % BOLCK_M_DIM, UNROLL_M) * (M % BOLCK_M_DIM > 0); + + U32 newUnrollM = unrollMSizes[edgeNSize >> 4]; + resM = M % newUnrollM; + U32 resEdgeMSize = getEdgeMSize(resM, newUnrollM); + UINT8 *lastResBlockA = packA + M / newUnrollM * newUnrollM * K; + if (resM < resEdgeMSize && matrix1Df == DF_NORMAL) { // padding last block + UNI_MEMCPY(tmp, lastResBlockA, resM * K); + UNI_MEMSET(tmp + resM * K, 128, (resEdgeMSize - resM) * K); + lastResBlockA = tmp; + tmp += resEdgeMSize * K; } - idx = (unNum > 0) ? 0 : 1; - U32 umUnit = umArrays[idx]; - U32 firstLoopNum = (unArrays[idx] >= 8) ? 
computeMNums(M, umUnit) : 1; - U32 loopNum = mLoopNum + nLoopResNum - firstLoopNum; - if (unNum >= 1) { - unNum -= 1; - nLoopNum -= umNums[0]; - nLoopResNum -= umResNums[0]; - } else { - nLoopNum -= umNums[1]; - nLoopResNum -= umResNums[1]; + U32 resMloopNum = getMNum(BOLCK_M_DIM, newUnrollM) * (M / BOLCK_M_DIM) + + getMNum(M % BOLCK_M_DIM, newUnrollM) * (M % BOLCK_M_DIM > 0); + + U32 padM = UNI_MAX(UNI_ALIGN(M, UNROLL_M), UNI_ALIGN(M, newUnrollM)); + UINT8 *tmpK = tmp; + U32 resK = K % SIMDW; + if (resK > 0 && matrix1Df == DF_NORMAL) { + for (U32 i = 0; i < M; ++i) { + UNI_MEMCPY(tmpK + i * SIMDW, packA + (i + 1) * K - resK, resK); + UNI_MEMSET(tmpK + i * SIMDW + resK, 128, SIMDW - resK); + } + UNI_MEMSET(tmpK + M * SIMDW, 128, (padM - M) * SIMDW); + tmp += padM * SIMDW; } - mLoopNum = nLoopNum * mNum; + U32 mNNum = N / UNROLL_N; + U32 alginedN = mNNum * UNROLL_N + (resN > 0) * edgeNSize; + U32 nmask = pow(2, N % 16) - 1; + U32 loopNum = mNNum * mloopNum + (resN > 0) * resMloopNum; + U32 bmLoopNum = + mNNum * getMNum(BOLCK_M_DIM, UNROLL_M) + (resN > 0) * getMNum(BOLCK_M_DIM, newUnrollM); #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) if (mLoopNum + nLoopResNum > OMP_NUM_THREADS) - { +#pragma omp parallel num_threads(OMP_NUM_THREADS) #endif + { U32 blockSizeK = 0; for (U32 k = 0; k < K; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, K - k); - blockSizeK = UNI_MAX(blockSizeK % SIMDW, blockSizeK - blockSizeK % SIMDW); - U32 alignedBlockSizeK = align_size(blockSizeK, SIMDW); F32 *useFactor = nullptr; flags |= (k > 0); if (k == K - blockSizeK) { useFactor = factorPtr; } -#ifdef _USE_OPENMP -#pragma omp for schedule(static) -#endif - for (U32 l = 0; l < firstLoopNum; ++l) { - U32 umNum = M / umUnit; - U32 idxM = 2; - U32 m = 0; - U32 unrollSizeM = 0; - if (l < umNum) { - m = l * umUnit; - unrollSizeM = umUnit; - } else if (l == umNum) { - m = umNum * umUnit; - if ((M - umNum * umUnit) >= (umUnit / 2)) { - unrollSizeM = umUnit / 2; - idxM = 1; - } else { - unrollSizeM = 1; - idxM = 0; - } - } else { - if (M >= (umNum * umUnit + umUnit / 2)) { - m = umNum * umUnit + umUnit / 2 + (l - umNum - 1); - } else { - m = umNum * umUnit + (l - umNum); - } - unrollSizeM = 1; - idxM = 0; - } - - U32 stepK = K; - INT8 *curB = packB + k * N; - UINT8 *curA = packA + m * stepK + k; - if (matrix1Df == DF_TRANSPOSE) { - curA = tmpA + m * alignedBlockSizeK; - matrix2_trans_r(unrollSizeM, blockSizeK, M, SIMDW, matrix1 + m + k * M, curA); - stepK = alignedBlockSizeK; - } else if (matrix1Df == DF_NORMAL && blockSizeK < SIMDW) { - curA = tmpA + m * alignedBlockSizeK; - matrix1_trans_r(unrollSizeM, blockSizeK, K, SIMDW, matrix1 + k + m * K, curA); - stepK = alignedBlockSizeK; - } - kernel[idxM][(unArrays[idx] >> 4) + (unArrays[idx] >= 8)](unrollSizeM, - unArrays[idx], alignedBlockSizeK / 8, curA, curB, i32Result + m * N, - u8Result + m * N, offsetC, N, stepK, useFactor, flags); + U32 realK = blockSizeK; + U32 stepK = K; + if (matrix1Df == DF_TRANSPOSE) { + matrix2_trans_r(M, blockSizeK, M, SIMDW, packA, tmp); + realK = UNI_ALIGN(realK, SIMDW); + packA = tmp; + stepK = realK; } #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif for (U32 l = 0; l < loopNum; ++l) { - U32 bm = l / nLoopNum * BOLCK_M_DIM; - U32 nLoop = l % nLoopNum; - U32 unrollSizeN = 0; - U32 blockSizeM = 0; - U32 unrollM = 0; - U32 m = 0, n = 0; - U32 *umNumsPtr; - if (l < mLoopNum) { - blockSizeM = BOLCK_M_DIM; - umNumsPtr = umNums; - } else { - blockSizeM = M % BOLCK_M_DIM; - umNumsPtr = umResNums; + U32 bm = l / 
bmLoopNum * BOLCK_M_DIM; + U32 blockSizeM = UNI_MIN(BOLCK_M_DIM, M - bm); + U32 mMNum = getMNum(blockSizeM, UNROLL_M); + U32 bn = l % bmLoopNum; + U32 nLoop = bn / mMNum; + U32 n = nLoop * UNROLL_N; + U32 mLoop = bn % mMNum; + U32 m = mLoop * UNROLL_M; + U32 edgeMSize = mainEdgeMSize; + U32 unrollM = UNROLL_M; + U32 mNum = mMNum; + U32 nSize = UNROLL_N; + UINT8 *lastBlockA = lastMainBlockA; + if (bn >= mNNum * mMNum) { + nLoop = mNNum; + n = mNNum * UNROLL_N; + mLoop = bn - mNNum * mMNum; + m = mLoop * newUnrollM; + edgeMSize = resEdgeMSize; + lastBlockA = lastResBlockA; + unrollM = newUnrollM; + mNum = getMNum(blockSizeM, newUnrollM); + nSize = edgeNSize; } - if (nLoop < unNum * umNumsPtr[0]) { - n = nLoop / umNumsPtr[0] * unArrays[0]; - m = nLoop % umNumsPtr[0]; - unrollSizeN = unArrays[0]; - unrollM = umArrays[0]; - } else { - n = unNum * unArrays[0]; - U32 x = unNum * umNumsPtr[0]; - for (int j = idx + 1; j < 4; x += umNumsPtr[j], n += unArrays[j], ++j) { - if (nLoop < x + umNumsPtr[j]) { - m = nLoop - x; - unrollSizeN = unArrays[j]; - unrollM = umArrays[j]; - break; - } - } + U32 um = (unrollM + m > blockSizeM) ? edgeMSize : unrollM; + U32 rm = UNI_MIN(unrollM, blockSizeM - m); + INT8 *curB = packB + k * alginedN + n * UNI_ALIGN(realK, SIMDW); + UINT8 *curA = packA + (m + bm) * stepK + k; + if ((mLoop == (mNum - 1)) && (M - bm <= BOLCK_M_DIM) && (resM < edgeMSize) && + (matrix1Df == DF_NORMAL)) { + curA = lastBlockA + k; } - - U32 unrollSizeM = 0; - U32 umNum = blockSizeM / unrollM; - U32 idxM = 2; - if (m < umNum) { - m = m * unrollM; - unrollSizeM = unrollM; - } else if (m == umNum) { - m = umNum * unrollM; - if ((blockSizeM - umNum * unrollM) >= (unrollM / 2)) { - unrollSizeM = unrollM / 2; - idxM = 1; - } else { - unrollSizeM = 1; - idxM = 0; - } - } else { - if (blockSizeM >= (umNum * unrollM + unrollM / 2)) { - m = umNum * unrollM + unrollM / 2 + (m - umNum - 1); - } else { - m = umNum * unrollM + (m - umNum); - } - unrollSizeM = 1; - idxM = 0; - } - - n += unArrays[idx]; - INT8 *curB = packB + k * N + n * alignedBlockSizeK; - UINT8 *curA = packA + (m + bm) * K + k; - kernel[idxM][(unrollSizeN >> 4) + (unrollSizeN >= 8)](unrollSizeM, unrollSizeN, - alignedBlockSizeK / 8, curA, curB, i32Result + (m + bm) * N + n, - u8Result + (m + bm) * N + n, offsetC + n, N, K, useFactor, flags); + UINT8 *kpad = tmpK + (m + bm) * SIMDW; + U32 tnmask = (nLoop == mNNum - 1 + (resN > 0)) ? 
nmask : 0; + kernel[rm - 1][nSize >> 4](um, nSize, realK, curA, curB, + i32Result + (m + bm) * N + n, u8Result + (m + bm) * N + n, offsetC + n, N, + stepK, useFactor, tnmask, kpad, flags); } } -#ifdef _USE_OPENMP } -#endif - return SUCCESS; } diff --git a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp index 4ec2c1e3..b3fa74be 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni.cpp @@ -41,7 +41,7 @@ EE matrix_vector_multiply_transform_weight_int8( I32 *sumB = nullptr; if (!hasBias) { sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); + UNI_MEMSET(sumB, 0, N * sizeof(I32)); packB += N * bytesOf(DT_I32); } else { sumB = offsetCBias; @@ -49,13 +49,13 @@ EE matrix_vector_multiply_transform_weight_int8( U32 blockKSize = 0; for (U32 bk = 0; bk < K; bk += blockKSize) { blockKSize = UNI_MIN(K - bk, BOLCK_K_DIM); - U32 alignedBlockSizeK = align_size(blockKSize, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockKSize, 4); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); unrollSizeN = unrollSize[unrollSizeN >> 4]; if (N - un < unrollSizeN) { unrollSizeN = N - un; - memset(packB, 0, unrollSizeN * alignedBlockSizeK); + UNI_MEMSET(packB, 0, unrollSizeN * alignedBlockSizeK); for (U32 k = 0; k < alignedBlockSizeK; k += 4) { for (U32 i = 0; i < unrollSizeN; ++i) { for (U32 ii = 0; ii < 4 && k + ii < blockKSize; ++ii) { @@ -65,7 +65,8 @@ EE matrix_vector_multiply_transform_weight_int8( } } } else { - matrix1_trans_l(unrollSizeN, blockKSize, K, 4, src + un * K + bk, packB); + matrix1_trans_l( + unrollSizeN, unrollSizeN, blockKSize, K, 4, src + un * K + bk, packB); } packB += unrollSizeN * alignedBlockSizeK; } @@ -84,7 +85,7 @@ EE matrix_vector_multiply_transform_weight_int8( I32 *sumB = nullptr; if (!hasBias) { sumB = (I32 *)packB; - memset(sumB, 0, N * sizeof(I32)); + UNI_MEMSET(sumB, 0, N * sizeof(I32)); packB += N * bytesOf(DT_I32); } else { sumB = offsetCBias; @@ -92,13 +93,13 @@ EE matrix_vector_multiply_transform_weight_int8( U32 blockKSize = 0; for (U32 bk = 0; bk < K; bk += blockKSize) { blockKSize = UNI_MIN(K - bk, BOLCK_K_DIM); - U32 alignedBlockSizeK = align_size(blockKSize, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockKSize, 4); for (U32 un = 0; un < N; un += unrollSizeN) { unrollSizeN = UNI_MIN(UNROLL_N, N - un); unrollSizeN = unrollSize[unrollSizeN >> 4]; if (N - un < unrollSizeN) { unrollSizeN = N - un; - memset(packB, 0, unrollSizeN * alignedBlockSizeK); + UNI_MEMSET(packB, 0, unrollSizeN * alignedBlockSizeK); for (U32 k = 0; k < blockKSize; k += 4) { for (U32 i = 0; i < unrollSizeN; ++i) { for (U32 ii = 0; ii < 4 && k + ii < blockKSize; ++ii) { @@ -108,7 +109,8 @@ EE matrix_vector_multiply_transform_weight_int8( } } } else { - matrix2_trans_l(unrollSizeN, blockKSize, N, 4, src + un + bk * N, packB); + matrix2_trans_l( + unrollSizeN, unrollSizeN, blockKSize, N, 4, src + un + bk * N, packB); } packB += unrollSizeN * alignedBlockSizeK; } @@ -680,6 +682,8 @@ void mvm_row_avx512_tail(U32 bn, I32 tmp = 0; if ((flags & 0x1) == 0) { tmp += offsetC[n]; + } else { + tmp = ((I32 *)result)[n]; } for (U32 k = 0; k < bk; k += 4) { for (U32 k4 = 0; k4 < 4; ++k4) { @@ -717,12 +721,12 @@ EE mvm_avx512_int8(U32 numRows, I32 *i32Result = (I32 *)result; UINT8 *u8Result = result; if (scale != nullptr) { - if (scale[0] < - 0) { // when use offline scale, the output datatype is U8_Q, you need more tmp buffer + // when use offline 
scale, the output datatype is U8_Q, you need more tmp buffer + if (scale[0] < 0) { flags |= 1 << 1; factor = scale[1]; i32Result = offsetCBias + numRows; - memset(i32Result, 0, numRows * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, numRows * bytesOf(DT_I32)); } else { factor = 1 / (*scale); } @@ -731,7 +735,7 @@ EE mvm_avx512_int8(U32 numRows, packB += numRows * bytesOf(DT_I32); for (U32 k = 0; k < numColumns; k += blockSizeK) { blockSizeK = UNI_MIN(BOLCK_K_DIM, numColumns - k); - U32 alignedBlockSizeK = align_size(blockSizeK, 4); + U32 alignedBlockSizeK = UNI_ALIGN(blockSizeK, 4); flags |= (k > 0); F32 *useFactor = nullptr; if (k == numColumns - blockSizeK) { diff --git a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp index 5ad5b8bd..5030eea6 100644 --- a/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp +++ b/compute/blas_enhance/src/cpu/x86/int8/mvm_avx512_vnni_row.cpp @@ -1000,7 +1000,7 @@ EE mvm_avx512_int8_row_i8u8(U32 numRows, flags |= 1 << 1; factor = scale[1]; i32Result = (I32 *)((UINT8 *)tmp + numRows * numColumns); - memset(i32Result, 0, numRows * bytesOf(DT_I32)); + UNI_MEMSET(i32Result, 0, numRows * bytesOf(DT_I32)); } else { factor = 1 / (*scale); } diff --git a/compute/blas_enhance/src/cpu/x86/mmm.cpp b/compute/blas_enhance/src/cpu/x86/mmm.cpp index d75353e8..62fe0455 100644 --- a/compute/blas_enhance/src/cpu/x86/mmm.cpp +++ b/compute/blas_enhance/src/cpu/x86/mmm.cpp @@ -23,7 +23,7 @@ #endif EE matrix_matrix_multiply_tmp_bytes_x86( - U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataType dt, U32 *bytes) + U32 matrixA_M, U32 matrixA_K, U32 matrixB_K, U32 matrixB_N, DataFormat df, DataType dt, U32 *bytes) { EE ret = SUCCESS; switch (dt) { @@ -38,7 +38,7 @@ EE matrix_matrix_multiply_tmp_bytes_x86( case DT_U8_Q: case DT_I8: { matrix_matrix_multiply_tmp_bytes_int8( - matrixA_M, matrixA_K, matrixB_K, matrixB_N, dt, bytes); + matrixA_M, matrixA_K, matrixB_K, matrixB_N, df, dt, bytes); break; } #endif @@ -51,7 +51,7 @@ EE matrix_matrix_multiply_tmp_bytes_x86( } static EE matrix_matrix_multiply_transform_rhsN( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { @@ -64,7 +64,7 @@ static EE matrix_matrix_multiply_transform_rhsN( #ifdef _USE_INT8 case DT_I8: { ret = matrix_matrix_multiply_transform_rhsN_int8( - desc, (INT8 *)src, (INT8 *)dst, (I32 *)offsetCBias); + desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -78,7 +78,7 @@ static EE matrix_matrix_multiply_transform_rhsN( } static EE matrix_matrix_multiply_transform_rhsT( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { EE ret = SUCCESS; switch (desc.dt) { @@ -91,7 +91,7 @@ static EE matrix_matrix_multiply_transform_rhsT( #ifdef _USE_INT8 case DT_I8: { ret = matrix_matrix_multiply_transform_rhsT_int8( - desc, (INT8 *)src, (INT8 *)dst, (I32 *)offsetCBias); + desc, (INT8 *)src, (INT8 *)dst); break; } #endif @@ -106,7 +106,7 @@ static EE matrix_matrix_multiply_transform_rhsT( } EE matrix_matrix_multiply_transform_rhs_x86( - TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, void *offsetCBias) + TensorDesc desc, const void *src, TensorDesc *descTran, void *dst) { if (desc.df == targetFormat4MatrixB(desc.dt)) { return SUCCESS; @@ -114,11 +114,11 @@ EE 
matrix_matrix_multiply_transform_rhs_x86( EE ret = SUCCESS; switch (desc.df) { case DF_NORMAL: { - ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst, offsetCBias); + ret = matrix_matrix_multiply_transform_rhsN(desc, src, descTran, dst); break; } case DF_TRANSPOSE: { - ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst, offsetCBias); + ret = matrix_matrix_multiply_transform_rhsT(desc, src, descTran, dst); break; } default: diff --git a/compute/blas_enhance/src/mmm.cpp b/compute/blas_enhance/src/mmm.cpp index 5d982700..871121cb 100644 --- a/compute/blas_enhance/src/mmm.cpp +++ b/compute/blas_enhance/src/mmm.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "blas_enhance.h" +#include "uni.h" #ifdef _USE_GENERAL #include "cpu/general/blas_general.h" #endif @@ -44,10 +45,10 @@ EE matrix_matrix_multiply_tmp_bytes( #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = matrix_matrix_multiply_tmp_bytes_x86( - matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); + matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataFormat, matrixADataType, bytes); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = matrix_matrix_multiply_tmp_bytes_arm( matrixA_M, matrixA_K, matrixB_K, matrixB_N, matrixADataType, bytes); #endif @@ -59,23 +60,21 @@ EE matrix_matrix_multiply_transform_rhs( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) { EE ret = NOT_SUPPORTED; -#ifdef _USE_NEON if (IS_ARM(arch)) { +#ifdef _USE_NEON ret = matrix_matrix_multiply_transform_rhs_arm(desc, src, descTran, dst); - } #endif #ifdef _USE_GENERAL - if (IS_GENERAL(arch)) { - memcpy(dst, src, tensorNumBytes(desc)); + } else if (IS_GENERAL(arch)) { + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); (*descTran) = desc; ret = SUCCESS; - } #endif #ifdef _USE_X86 - if (IS_X86(arch)) { - ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst, nullptr); - } + } else if (IS_X86(arch)) { + ret = matrix_matrix_multiply_transform_rhs_x86(desc, src, descTran, dst); #endif + } return ret; } @@ -142,23 +141,23 @@ EE matrix_matrix_multiply(TensorDesc matrixADesc, TensorDesc tranDescB; U8 *dataB = (U8 *)matrixBData; if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { - U8 *offsetCBias = nullptr; - U32 alignedAK = matrixA_K; dataB = ((U8 *)tmp); if (matrixADataType == DT_U8_Q && matrixBDataType == DT_I8) { - offsetCBias = (U8 *)tmp; - alignedAK = (matrixA_K + 7) / 8 * 8; - dataB += matrixC_N * bytesOf(DT_I32); + U32 alignedK = (matrixB_K + 7) / 8 * 8; + U32 alignedN = (matrixB_N + 15) / 16 * 16; + tmp = (U8 *)tmp + alignedK * alignedN; + } else { + U32 alignedN = (matrixB_N + 7) / 8 * 8; + tmp = (U8 *)tmp + matrixB_K * alignedN; } - dataB += matrixA_M * alignedAK * bytesOf(matrixADataType); ret = matrix_matrix_multiply_transform_rhs_x86( - matrixBDesc, matrixBData, &tranDescB, dataB, offsetCBias); + matrixBDesc, matrixBData, &tranDescB, dataB); } ret = mmm_x86(matrixC_N, matrixC_M, matrixA_K, matrixBDataType, matrixADataFormat, matrixAData, dataB, tmp, matrixCData, scale); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { TensorDesc tranDescB; U8 *dataB = (U8 *)matrixBData; if (matrixBDataFormat != targetFormat4MatrixB(matrixBDataType)) { diff --git a/compute/blas_enhance/src/mvm.cpp b/compute/blas_enhance/src/mvm.cpp index 359a7d58..501423d2 100644 --- a/compute/blas_enhance/src/mvm.cpp +++ b/compute/blas_enhance/src/mvm.cpp @@ -48,23 +48,21 @@ EE 
matrix_vector_multiply_transform_weight( TensorDesc desc, const void *src, TensorDesc *descTran, void *dst, Arch arch) { EE ret = NOT_SUPPORTED; -#ifdef _USE_NEON if (IS_ARM(arch)) { +#ifdef _USE_NEON ret = matrix_vector_multiply_transform_weight_arm(desc, src, descTran, dst); - } #endif #ifdef _USE_GENERAL - if (IS_GENERAL(arch)) { - memcpy(dst, src, tensorNumBytes(desc)); + } else if (IS_GENERAL(arch)) { + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); (*descTran) = desc; ret = SUCCESS; - } #endif #ifdef _USE_X86 - if (IS_X86(arch)) { + } else if (IS_X86(arch)) { ret = matrix_vector_multiply_transform_weight_x86(desc, src, descTran, dst, nullptr); - } #endif + } return ret; } @@ -140,7 +138,7 @@ EE matrix_vector_multiply(TensorDesc matrixDesc, result, tmp, scale); #endif #ifdef _USE_NEON - } else { + } else if (IS_ARM(arch)) { ret = mvm_arm(matrixRow, matrixColumn, matrixDataType, matrixDataFormat, matrix, vector, tmp, result, arch); #endif diff --git a/compute/blas_enhance/tests/test_mmm.cpp b/compute/blas_enhance/tests/test_mmm.cpp index 80f3923e..b5de8983 100644 --- a/compute/blas_enhance/tests/test_mmm.cpp +++ b/compute/blas_enhance/tests/test_mmm.cpp @@ -31,7 +31,9 @@ int mmmTestKernel(U32 m, U32 k, U32 n, DataType dt) U32 bytes = 0; U8 *A = ut_input_v(m * k, dt, UT_INIT_RANDOM); U8 *B = ut_input_v(k * n, dt, UT_INIT_RANDOM); - U8 *B_tran = ut_input_v(k * n + 32, dt, UT_INIT_ZERO); + + U32 alignedN = (n + 7) / 8 * 8; + U8 *B_tran = ut_input_v(k * alignedN + 32, dt, UT_INIT_ZERO); U8 *C = ut_input_v(m * n, dt, UT_INIT_ZERO); U8 *C_ref = ut_input_v(m * n, dt, UT_INIT_ZERO); CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); diff --git a/compute/blas_enhance/tests/test_mmm_int8.cpp b/compute/blas_enhance/tests/test_mmm_int8.cpp index 18a88be6..c7c72777 100644 --- a/compute/blas_enhance/tests/test_mmm_int8.cpp +++ b/compute/blas_enhance/tests/test_mmm_int8.cpp @@ -11,17 +11,16 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
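Reviewer note on the int8 GEMM changes above: when an offline scale is supplied (scale[0] < 0, flag bit 1 set), the AVX512-VNNI kernels requantize the I32 accumulators to U8_Q by scaling, adding a +128 offset and saturating to a byte (the vcvtdq2ps / vmulps / vcvtps2dq / vpmovusdb sequence, mirrored in C by the removed mmm_avx512_n_mtail tail). A scalar sketch of that step, with illustrative names only:

static inline unsigned char requantize_to_u8(int acc, float scale)
{
    float f = acc * scale;      // vcvtdq2ps + vmulps: scale the I32 accumulator
    int q = (int)(f + 128);     // shift from the symmetric INT8 range into the unsigned range
    if (q < 0) {
        q = 0;                  // vpmovusdb saturates the low end at 0
    }
    return (q > 255) ? 255 : (unsigned char)q;  // ...and the high end at 255
}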
+#include #include "blas_enhance.h" #include "ut_util.h" +#include "thread_affinity.h" -int main(int argc, char **argv) +//#define COVER_TEST + +int testMMM(U32 m, U32 k, U32 n) { #ifdef _USE_INT8 - CHECK_REQUIREMENT(argc == 4); - U32 m = atoi(argv[1]); - U32 k = atoi(argv[2]); - U32 n = atoi(argv[3]); - DataType dt = DT_I8; DataType odt = DT_I32; TensorDesc A_desc = tensor2df(dt, DF_NORMAL, m, k); @@ -31,17 +30,22 @@ int main(int argc, char **argv) U32 bytes = 0; U32 k8 = k; + U32 n8 = n; if (k8 % 8 != 0) { k8 = (k8 / 8) * 8 + 8; } + if (n8 % 16 != 0) { + n8 = (n8 / 16) * 16 + 16; + } INT8 *A = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); INT8 *A_ref = (INT8 *)ut_input_v(m * k, DT_I8, UT_INIT_RANDOM); - memcpy(A_ref, A, m * k); + UNI_MEMCPY(A_ref, A, m * k); INT8 *B = (INT8 *)ut_input_v(k * n, DT_I8, UT_INIT_RANDOM); - INT8 *B_tran = (INT8 *)ut_input_v(k8 * n + 64 + n * 4, DT_I8, UT_INIT_ZERO); + INT8 *B_tran = (INT8 *)ut_input_v(k8 * n8 + 64 + n8 * 4, DT_I8, UT_INIT_ZERO); I32 *C = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); I32 *C_ref = (I32 *)ut_input_v(m * n, DT_I32, UT_INIT_ZERO); CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(A_desc, B_desc, &bytes, UT_ARCH)); + bytes += m * n; INT8 *tmp = (INT8 *)ut_input_v(bytes, DT_I8, UT_INIT_ZERO); matrix_matrix_multiply_transform_rhs(B_desc, B, &tranDescB, B_tran, UT_ARCH); @@ -51,15 +55,15 @@ int main(int argc, char **argv) for (U32 i = 0; i < m * k; ++i) { uA[i] = (UINT8)((I32)A[i] + 128); } - memcpy(tmp, B_tran, n * bytesOf(DT_I32)); + UNI_MEMCPY(tmp, B_tran + n8 * k8, n * bytesOf(DT_I32)); #endif if (UT_CHECK) { - CHECK_STATUS(matrix_matrix_multiply( - A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH)); + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH)); - CHECK_STATUS(matrix_matrix_multiply( - A_desc, A_ref, B_desc, B, bytes, tmp, C_desc, C_ref, nullptr, CPU_GENERAL)); + CHECK_STATUS( + matrix_matrix_multiply(A_desc, A_ref, B_desc, B, bytes, tmp, C_desc, C_ref, nullptr, CPU_GENERAL)); // check ut_check_v(C, C_ref, m * n, DT_I32, 1, __FILE__, __LINE__); @@ -68,8 +72,7 @@ int main(int argc, char **argv) // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - matrix_matrix_multiply( - A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH); + matrix_matrix_multiply(A_desc, A, tranDescB, B_tran, bytes, tmp, C_desc, C, nullptr, UT_ARCH); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; @@ -91,3 +94,21 @@ int main(int argc, char **argv) #endif return 0; } + +int main(int argc, char **argv) +{ +#ifdef COVER_TEST + int ret = 0; + for (U32 m = 1; m < 48; ++m) { + for (U32 k = 1; k < 48; ++k) { + for (U32 n = 1; n < 48; ++n) { + ret = testMMM(m, k, n); + } + } + } + return ret; +#else + CHECK_REQUIREMENT(argc == 4); + return testMMM(atoi(argv[1]), atoi(argv[2]), atoi(argv[3])); +#endif +} diff --git a/compute/blas_enhance/tests/test_mvm_int8.cpp b/compute/blas_enhance/tests/test_mvm_int8.cpp index 3165c956..1af973d4 100644 --- a/compute/blas_enhance/tests/test_mvm_int8.cpp +++ b/compute/blas_enhance/tests/test_mvm_int8.cpp @@ -47,7 +47,7 @@ int main(int argc, char **argv) INT8 *matTran = (INT8 *)ut_input_v(m * k4 + m * 4, DT_I8, UT_INIT_ZERO); INT8 *vec = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); INT8 *vec_ref = (INT8 *)ut_input_v(vc, DT_I8, UT_INIT_RANDOM); - memcpy(vec_ref, vec, vc); + UNI_MEMCPY(vec_ref, vec, vc); I32 *res = (I32 *)ut_input_v(rc, DT_I32, 
UT_INIT_ZERO); I32 *res_ref = (I32 *)ut_input_v(rc, DT_I32, UT_INIT_ZERO); @@ -62,7 +62,7 @@ int main(int argc, char **argv) for (U32 i = 0; i < vc; ++i) { uA[i] = (UINT8)((I32)vec[i] + 128); } - memcpy(tmp, matTran, rc * bytesOf(DT_I32)); + UNI_MEMCPY(tmp, matTran, rc * bytesOf(DT_I32)); #endif // check diff --git a/compute/image/include/image.h b/compute/image/include/image.h index 13202f80..516707c5 100644 --- a/compute/image/include/image.h +++ b/compute/image/include/image.h @@ -23,13 +23,29 @@ #include "ocl_desc_trans.h" #endif -EE resize_infer_output_size(Tensor *inputTensor, - DataType paramDT, - void *params, - Tensor *outputTensor, - U32 *outputBytes, - ArchInfo_t archInfo); +EE resize_infer_output_size( + Tensor *inputTensor, ResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); + +EE resize_infer_forward_tmp_bytes( + Tensor inputTensor, ResizeParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); EE resize( - Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ResizeParamSpec p, ArchInfo_t archInfo); + Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); + +EE grid_sample_infer_output_size( + Tensor *inputTensor, Tensor *gridTensor, Tensor *outputTensor, ArchInfo_t archInfo); + +EE grid_sample_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor outputTensor, + U32 *bytes, + ArchInfo_t archInfo); + +EE grid_sample(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); #endif diff --git a/compute/image/src/CMakeLists.txt b/compute/image/src/CMakeLists.txt index 7c2da109..32d08e45 100644 --- a/compute/image/src/CMakeLists.txt +++ b/compute/image/src/CMakeLists.txt @@ -26,6 +26,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries (${PROJECT_NAME} LINK_PUBLIC uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/image/src/cpu/arm/image_arm.h b/compute/image/src/cpu/arm/image_arm.h index cfbe7f19..5374fd18 100644 --- a/compute/image/src/cpu/arm/image_arm.h +++ b/compute/image/src/cpu/arm/image_arm.h @@ -16,6 +16,7 @@ #include "error.h" #include "tensor_desc.h" +#include "parameter_spec.h" -EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); +EE resize_bilinear_arm(TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/image/src/cpu/arm/resize_bilinear.cpp b/compute/image/src/cpu/arm/resize_bilinear.cpp index 1dbf6722..4b87c17c 100644 --- a/compute/image/src/cpu/arm/resize_bilinear.cpp +++ b/compute/image/src/cpu/arm/resize_bilinear.cpp @@ -16,7 +16,8 @@ #include "uni.h" #ifdef _USE_FP16 -EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDesc, F16 *outArray) +EE resize_bilinear_fp16( + TensorDesc inputDesc, F16 *inArray, ResizeParamSpec p, TensorDesc outputDesc, F16 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -28,8 +29,14 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDes if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { CHECK_STATUS(NOT_MATCH); } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { 
+ strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } U32 ic_align = 8, oc_align = 8; ic /= ic_align; oc /= oc_align; @@ -78,7 +85,8 @@ EE resize_bilinear_fp16(TensorDesc inputDesc, F16 *inArray, TensorDesc outputDes #endif #ifdef _USE_FP32 -EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDesc, F32 *outArray) +EE resize_bilinear_fp32( + TensorDesc inputDesc, F32 *inArray, ResizeParamSpec p, TensorDesc outputDesc, F32 *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -90,8 +98,14 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDes if (idf != DF_NCHWC8 || odf != DF_NCHWC8) { CHECK_STATUS(NOT_MATCH); } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { + strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } U32 ic_align = 8, oc_align = 8; ic /= ic_align; oc /= oc_align; @@ -148,18 +162,19 @@ EE resize_bilinear_fp32(TensorDesc inputDesc, F32 *inArray, TensorDesc outputDes } #endif -EE resize_bilinear_arm(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +EE resize_bilinear_arm( + TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output) { EE ret = SUCCESS; switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: - ret = resize_bilinear_fp16(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = resize_bilinear_fp16(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); break; #endif #ifdef _USE_FP32 case DT_F32: - ret = resize_bilinear_fp32(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = resize_bilinear_fp32(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); break; #endif default: diff --git a/compute/image/src/cpu/general/image_general.h b/compute/image/src/cpu/general/image_general.h index a44a19bc..cb5e98a3 100644 --- a/compute/image/src/cpu/general/image_general.h +++ b/compute/image/src/cpu/general/image_general.h @@ -15,6 +15,7 @@ #define _H_IMAGE_GENERAL #include "tensor_desc.h" +#include "parameter_spec.h" -EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); +EE resize_bilinear_general(TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/image/src/cpu/general/resize_bilinear.cpp b/compute/image/src/cpu/general/resize_bilinear.cpp index 09705e3c..cc38e0ed 100644 --- a/compute/image/src/cpu/general/resize_bilinear.cpp +++ b/compute/image/src/cpu/general/resize_bilinear.cpp @@ -15,7 +15,8 @@ #include "uni.h" template -EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT *outArray) +EE resize_bilinear( + TensorDesc inputDesc, IT *inArray, ResizeParamSpec p, TensorDesc outputDesc, OT *outArray) { DataType idt, odt; DataFormat idf, odf; @@ -32,8 +33,14 @@ EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT oc_align = 8; } - F32 strideH = ((F32)ih) / oh; - F32 strideW = ((F32)iw) / ow; + F32 strideH, strideW; + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { + strideH = ((F32)ih - 1) / (oh - 1); + strideW = ((F32)iw - 1) / (ow - 1); + } else { + strideH = ((F32)ih) / oh; + strideW = ((F32)iw) / ow; + } ic /= ic_align; oc /= oc_align; U32 srcTL, srcTR, srcBL, srcBR; @@ -75,6 +82,7 @@ EE 
resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT srcBL = ((n * ic + cc) * ih + hBB) * iw + wL; srcBR = ((n * ic + cc) * ih + hBB) * iw + wRR; } + outArray[dst] = inArray[srcTL] * factorTL + inArray[srcTR] * factorTR + inArray[srcBL] * factorBL + inArray[srcBR] * factorBR; } @@ -85,33 +93,34 @@ EE resize_bilinear(TensorDesc inputDesc, IT *inArray, TensorDesc outputDesc, OT return SUCCESS; } -EE resize_bilinear_general(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +EE resize_bilinear_general( + TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output) { EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = resize_bilinear(inputDesc, (F16 *)input, outputDesc, (F16 *)output); + ret = resize_bilinear(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = resize_bilinear(inputDesc, (F32 *)input, outputDesc, (F32 *)output); + ret = resize_bilinear(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); break; } #endif case DT_U8: { + if (0) { #ifdef _USE_FP16 - if (DT_F16 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F16 *)output); - } + } else if (DT_F16 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, p, outputDesc, (F16 *)output); #endif #ifdef _USE_FP32 - if (DT_F32 == outputDesc.dt) { - ret = resize_bilinear(inputDesc, (U8 *)input, outputDesc, (F32 *)output); - } + } else if (DT_F32 == outputDesc.dt) { + ret = resize_bilinear(inputDesc, (U8 *)input, p, outputDesc, (F32 *)output); #endif + } break; } default: diff --git a/compute/image/src/cpu/grid_sample.cpp b/compute/image/src/cpu/grid_sample.cpp new file mode 100644 index 00000000..c53fabb3 --- /dev/null +++ b/compute/image/src/cpu/grid_sample.cpp @@ -0,0 +1,246 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/image_cpu.h" +#include "uni.h" + +EE grid_sample_infer_output_size_cpu( + TensorDesc inputDesc, TensorDesc gridDesc, TensorDesc *outputDesc) +{ + *outputDesc = inputDesc; + outputDesc->dims[0] = gridDesc.dims[1]; + outputDesc->dims[1] = gridDesc.dims[2]; + CHECK_REQUIREMENT(gridDesc.dims[0] == inputDesc.nDims - 2); + return SUCCESS; +} + +static inline float denormalize(float n, int length, bool align_corners) +{ + float x; + if (align_corners) { + x = (n + 1) / 2. 
* (length - 1); + } else { + x = ((n + 1) * length - 1) / 2.; + } + return x; +} + +static inline float border(float x, float x_min, float x_max) +{ + return UNI_MIN(UNI_MAX(x, x_min), x_max); +} + +static inline float reflect(float x, float x_min, float x_max) +{ + float range = x_max - x_min; + if (x < x_min) { + float dx = x_min - x; + int n = dx / range; + float r = dx - n * range; + if (n % 2 == 0) { + x = x_min + r; + } else { + x = x_max - r; + } + } else if (x > x_max) { + float dx = x - x_max; + int n = dx / range; + float r = dx - n * range; + if (n % 2 == 0) { + x = x_max - r; + } else { + x = x_min + r; + } + } + return x; +} + +template +static inline float get( + const T *image, int it, int ih, int iw, int t, int h, int w, int cAlign, PadMode mode, float *bound) +{ + float pixel; + if (mode == PAD_CONSTANT) { + if (t >= 0 && t < it && h >= 0 && h < ih && w >= 0 && w < iw) { + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } else { + pixel = 0; + } + } else if (mode == PAD_EDGE) { + w = border(w, 0, iw - 1); + h = border(h, 0, ih - 1); + //t = border(t, 0, it - 1); + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } else { + w = reflect(w, bound[0], bound[1]); + h = reflect(h, bound[2], bound[3]); + //t = reflect(t, bound[4], bound[5]); + pixel = image[(((t * ih) + h) * iw + w) * cAlign]; + } + return pixel; +} + +template +static EE grid_sample_kernel(TensorDesc inputDesc, + T *input, + TensorDesc gridDesc, + T *grid, + GridSampleParamSpec p, + T *tmp, + TensorDesc outputDesc, + T *output) +{ + DataType idt; + DataFormat idf; + U32 in, ic, it, ih, iw; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &iw)); + it = ih = 1; + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + it = 1; + } else if (tensorIs5d(inputDesc)) { + CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); + } else { + return NOT_SUPPORTED; + } + int olen = tensorNumElements(outputDesc) / in / ic; + int S = tensorNumElements(gridDesc) / in / olen; + int cAlign = 1; + if (idf == DF_NCHWC8) { + cAlign = 8; + } + ic /= cAlign; + + float w_min = -0.5; + float w_max = iw - 0.5; + float h_min = -0.5; + float h_max = ih - 0.5; + float t_min = -0.5; + float t_max = it - 0.5; + if (p.align_corners) { + w_min = -0.5; + w_max = iw - 0.5; + h_min = -0.5; + h_max = ih - 0.5; + t_min = -0.5; + t_max = it - 0.5; + } + float bound[6] = {w_min, w_max, h_min, h_max, t_min, t_max}; + EE ret = SUCCESS; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + float x, y, z; + for (int i = 0; i < olen; i++) { + T *g = grid + (n * olen + i) * S; + for (int c8 = 0; c8 < cAlign; c8++) { + T *data = input + o * it * ih * iw * cAlign + c8; + T *out = output + (o * olen + i) * cAlign + c8; + x = denormalize(g[0], iw, p.align_corners); + if (S > 1) { + y = denormalize(g[1], ih, p.align_corners); + } else { + y = 0; + } + if (S > 2) { + z = denormalize(g[2], it, p.align_corners); + } else { + z = 0; + } + //switch (p.pad_mode) { + // case PAD_EDGE: { + // x = border(x, 0, iw - 1); + // y = border(y, 0, ih - 1); + // z = border(z, 0, it - 1); + // break; + // } + // case PAD_REFLECT: { + // x = reflect(x, w_min, w_max); + // y = reflect(y, h_min, h_max); + // z = reflect(z, t_min, t_max); + // break; + // } + // default: + // break; + //} + switch (p.mode) { + case RESIZE_NEAREST: { + x = round(x); + y = 
round(y); + z = round(z); + *out = get(data, it, ih, iw, z, y, x, cAlign, p.pad_mode, bound); + break; + } + case RESIZE_LINEAR: { + int x1 = floor(x); + int x2 = x1 + 1; + int y1 = floor(y); + int y2 = y1 + 1; + //int z1 = floor(z); + //int z2 = z1 + 1; + float p11 = get(data, it, ih, iw, 0, y1, x1, cAlign, p.pad_mode, bound); + float p12 = get(data, it, ih, iw, 0, y1, x2, cAlign, p.pad_mode, bound); + float p21 = get(data, it, ih, iw, 0, y2, x1, cAlign, p.pad_mode, bound); + float p22 = get(data, it, ih, iw, 0, y2, x2, cAlign, p.pad_mode, bound); + float dx2 = x2 - x; + float dx1 = x - x1; + float dy2 = y2 - y; + float dy1 = y - y1; + *out = dy2 * (dx2 * p11 + dx1 * p12) + dy1 * (dx2 * p21 + dx1 * p22); + break; + } + default: + UNI_ERROR_LOG("GridSample currently not support this mode.\n"); + ret = NOT_SUPPORTED; + break; + } + } + } + } + return ret; +} + +EE grid_sample_cpu(TensorDesc inputDesc, + void *input, + TensorDesc gridDesc, + void *grid, + GridSampleParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = grid_sample_kernel(inputDesc, (F16 *)input, gridDesc, (F16 *)grid, p, + (F16 *)tmp, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = grid_sample_kernel(inputDesc, (F32 *)input, gridDesc, (F32 *)grid, p, + (F32 *)tmp, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } + return ret; +} diff --git a/compute/image/src/cpu/image_cpu.h b/compute/image/src/cpu/image_cpu.h index 61030f4d..5049c101 100644 --- a/compute/image/src/cpu/image_cpu.h +++ b/compute/image/src/cpu/image_cpu.h @@ -19,4 +19,16 @@ EE resize_nearest_cpu( TensorDesc inputDesc, void *input, ResizeParamSpec p, TensorDesc outputDesc, void *output); + +EE grid_sample_infer_output_size_cpu( + TensorDesc inputDesc, TensorDesc gridDesc, TensorDesc *outputDesc); + +EE grid_sample_cpu(TensorDesc inputDesc, + void *input, + TensorDesc gridDesc, + void *grid, + GridSampleParamSpec p, + void *tmp, + TensorDesc outputDesc, + void *output); #endif diff --git a/compute/image/src/cpu/resize_nearest.cpp b/compute/image/src/cpu/resize_nearest.cpp index 0601e0e5..b4c1c776 100644 --- a/compute/image/src/cpu/resize_nearest.cpp +++ b/compute/image/src/cpu/resize_nearest.cpp @@ -12,8 +12,36 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
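Side note on the new grid_sample kernel: its denormalize() helper maps a normalized grid coordinate in [-1, 1] onto pixel space, and the two branches differ in where the endpoints land. A standalone restatement with a small numeric check (assumed equivalent to the helper in the patch, not taken from it verbatim):

#include <cstdio>

// Maps a normalized coordinate n in [-1, 1] onto an axis of `length` pixels.
static float denormalize_sketch(float n, int length, bool align_corners)
{
    return align_corners ? (n + 1) / 2.0f * (length - 1)    // endpoints hit pixel centers 0 and length-1
                         : ((n + 1) * length - 1) / 2.0f;   // endpoints sit half a pixel outside the border
}

int main()
{
    // length = 4: align_corners maps -1/0/+1 -> 0.0/1.5/3.0,
    // while the default mapping gives -0.5/1.5/3.5 instead.
    printf("%.1f %.1f %.1f\n", denormalize_sketch(-1, 4, true),
        denormalize_sketch(0, 4, true), denormalize_sketch(1, 4, true));
    printf("%.1f %.1f %.1f\n", denormalize_sketch(-1, 4, false),
        denormalize_sketch(0, 4, false), denormalize_sketch(1, 4, false));
    return 0;
}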
#include "cpu/image_cpu.h" +#include "affinity_policy.h" -template +template +inline static int round_d(float x) +{ + int ret = 0; + switch (round_mode) { + case ROUND_FLOOR: + ret = floor(x); + break; + case ROUND_CEIL: + ret = ceil(x); + break; + case ROUND_PREFER_FLOOR: + ret = round(x); + if (ret - x == 0.5) { + ret -= 1; + } + break; + case ROUND_PREFER_CEIL: + ret = round(x); + break; + default: + UNI_ERROR_LOG("Resize currently not support this round mode.\n"); + break; + } + return ret; +} + +template inline static EE resize_nearest_kernel( const TensorDesc &inputDesc, IT *inArray, const TensorDesc &outputDesc, OT *outArray) { @@ -38,61 +66,64 @@ inline static EE resize_nearest_kernel( float ws0 = iw * 1.0 / ow; float hs1 = (ih - 1.0) / (oh - 1.0); float ws1 = (iw - 1.0) / (ow - 1.0); - - int srcX, srcY, src; - for (U32 n = 0, dst = 0; n < on; n++) { - for (I32 c = 0; c < oc_d; c++) { - for (U32 h = 0; h < oh; h++) { - for (U32 w = 0; w < ow; w++) { - for (int k = 0; k < oc_align; k++, dst++) { - switch (coordinate_transformation_mode) { - case HALF_PIXEL: { - srcX = (h + 0.5) * hs0 - 0.5; - srcY = (w + 0.5) * ws0 - 0.5; - if (srcX < 0) { - srcX = 0; - } - if (srcY < 0) { - srcY = 0; - } - break; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < on * oc_d; o++) { + int n = o / oc_d; + int c = o % oc_d; + int dst = o * oh * ow * oc_align; + int srcX, srcY, src; + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++) { + for (int k = 0; k < oc_align; k++, dst++) { + switch (coordinate_transformation_mode) { + case COORDINATE_TRANS_HALF_PIXEL: { + srcX = round_d((h + 0.5) * hs0 - 0.5); + srcY = round_d((w + 0.5) * ws0 - 0.5); + if (srcX < 0) { + srcX = 0; } - case PYTORCH_HALF_PIXEL: { - srcX = oh > 1 ? (h + 0.5) * hs0 - 0.5 : 0; - srcY = ow > 1 ? (w + 0.5) * ws0 - 0.5 : 0; - if (srcX < 0) { - srcX = 0; - } - if (srcY < 0) { - srcY = 0; - } - break; + if (srcY < 0) { + srcY = 0; } - case ALIGN_CORNERS: { - srcX = h * hs1; - srcY = w * ws1; - break; + break; + } + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { + srcX = oh > 1 ? round_d((h + 0.5) * hs0 - 0.5) : 0; + srcY = ow > 1 ? 
round_d((w + 0.5) * ws0 - 0.5) : 0; + if (srcX < 0) { + srcX = 0; } - case ASYMMETRIC: { - srcX = h * hs0; - srcY = w * ws0; - break; + if (srcY < 0) { + srcY = 0; } - default: - UNI_ERROR_LOG("Resize currently not support this coordinate " - "transformation mode.\n"); - break; + break; + } + case COORDINATE_TRANS_ALIGN_CORNERS: { + srcX = round_d(h * hs1); + srcY = round_d(w * ws1); + break; } - U32 cc = c * oc_align + k; - if (idf == DF_NCHWC8) { - U32 cc1 = cc / ic_align; - U32 cc2 = cc % ic_align; - src = (((n * ic_d + cc1) * ih + srcX) * iw + srcY) * ic_align + cc2; - } else { - src = ((n * ic + cc) * ih + srcX) * iw + srcY; + case COORDINATE_TRANS_ASYMMETRIC: { + srcX = round_d(h * hs0); + srcY = round_d(w * ws0); + break; } - outArray[dst] = (OT)inArray[src]; + default: + UNI_ERROR_LOG("Resize currently not support this coordinate " + "transformation mode.\n"); + break; } + U32 cc = c * oc_align + k; + if (idf == DF_NCHWC8) { + U32 cc1 = cc / ic_align; + U32 cc2 = cc % ic_align; + src = (((n * ic_d + cc1) * ih + srcX) * iw + srcY) * ic_align + cc2; + } else { + src = ((n * ic + cc) * ih + srcX) * iw + srcY; + } + outArray[dst] = (OT)inArray[src]; } } } @@ -100,6 +131,43 @@ inline static EE resize_nearest_kernel( return SUCCESS; } +template +inline static EE resize_nearest_kernel(const TensorDesc &inputDesc, + IT *inArray, + ResizeParamSpec p, + const TensorDesc &outputDesc, + OT *outArray) +{ + EE ret = SUCCESS; + switch (p.round_mode) { + case ROUND_CEIL: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_FLOOR: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_PREFER_CEIL: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + case ROUND_PREFER_FLOOR: { + resize_nearest_kernel( + inputDesc, inArray, outputDesc, outArray); + break; + } + default: + UNI_ERROR_LOG("Resize currently not support this round mode.\n"); + ret = NOT_SUPPORTED; + break; + } + return ret; +} + template inline static EE resize_nearest_wrapper(const TensorDesc &inputDesc, IT *inArray, @@ -109,21 +177,24 @@ inline static EE resize_nearest_wrapper(const TensorDesc &inputDesc, { EE ret = SUCCESS; switch (p.trans_mode) { - case HALF_PIXEL: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_HALF_PIXEL: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case PYTORCH_HALF_PIXEL: { - resize_nearest_kernel( - inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case ALIGN_CORNERS: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_ALIGN_CORNERS: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } - case ASYMMETRIC: { - resize_nearest_kernel(inputDesc, inArray, outputDesc, outArray); + case COORDINATE_TRANS_ASYMMETRIC: { + resize_nearest_kernel( + inputDesc, inArray, p, outputDesc, outArray); break; } default: diff --git a/compute/image/src/cpu/x86/image_x86.h b/compute/image/src/cpu/x86/image_x86.h index b6396d35..0fcefe7f 100644 --- a/compute/image/src/cpu/x86/image_x86.h +++ b/compute/image/src/cpu/x86/image_x86.h @@ -20,8 +20,8 @@ EE resize_bilinear_x86(TensorDesc inputDesc, void *input, - TensorDesc outputDesc, + ResizeParamSpec p, void *tmp, - void *output, - ResizeParamSpec p); + TensorDesc outputDesc, + void *output); 
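The resize changes above rename the coordinate transformation enum values to COORDINATE_TRANS_* and route the fractional result through round_d(); for readability, the four source-index mappings used by infer_src and the nearest/bilinear kernels can be gathered into one helper (a sketch only; CoordinateTransMode is assumed to come from parameter_spec.h as in the patch):

// dst index x on an axis of out_len pixels -> fractional src coordinate on in_len pixels.
static float infer_src_sketch(int x, int in_len, int out_len, CoordinateTransMode mode)
{
    float scale = 1.0f * in_len / out_len;
    switch (mode) {
        case COORDINATE_TRANS_HALF_PIXEL:
            return (x + 0.5f) * scale - 0.5f;                      // pixel centers aligned
        case COORDINATE_TRANS_PYTORCH_HALF_PIXEL:
            return (out_len > 1) ? (x + 0.5f) * scale - 0.5f : 0;  // degenerate 1-pixel output maps to 0
        case COORDINATE_TRANS_ALIGN_CORNERS:
            return x * 1.0f * (in_len - 1) / (out_len - 1);        // first/last samples coincide
        case COORDINATE_TRANS_ASYMMETRIC:
        default:
            return x * scale;                                      // plain scaling from the origin
    }
}
// Callers clamp the result to [0, in_len - 1] before indexing, as the kernels above do.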
#endif diff --git a/compute/image/src/cpu/x86/resize_bilinear.cpp b/compute/image/src/cpu/x86/resize_bilinear.cpp index 43e2c2ba..0a391cfe 100644 --- a/compute/image/src/cpu/x86/resize_bilinear.cpp +++ b/compute/image/src/cpu/x86/resize_bilinear.cpp @@ -46,18 +46,22 @@ typedef void (*compute_bilinear_func)(F32 *input0, U32 onStep, U32 on); -inline F32 infer_src(I32 x, I32 iw, I32 ow, ResizeCoordinateTransMode trans_mode) +inline F32 infer_src(I32 x, I32 iw, I32 ow, CoordinateTransMode trans_mode) { + F32 scale = 1.0 * iw / ow; F32 ret; switch (trans_mode) { - case HALF_PIXEL: - ret = (x + 0.5f) * 1.0f * iw / ow - 0.5; + case COORDINATE_TRANS_HALF_PIXEL: + ret = (x + 0.5f) * scale - 0.5; break; - case ALIGN_CORNERS: + case COORDINATE_TRANS_ALIGN_CORNERS: ret = x * 1.0f * (iw - 1) / (ow - 1); break; - case PYTORCH_HALF_PIXEL: - ret = (ow > 1) ? ((x + 0.5f) * 1.0f * iw / ow - 0.5) : 0; + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: + ret = (ow > 1) ? ((x + 0.5f) * scale - 0.5) : 0; + break; + case COORDINATE_TRANS_ASYMMETRIC: + ret = x * scale; break; default: ret = 0; @@ -281,8 +285,59 @@ inline void compute_bilinear_nchw_fp32(F32 *input0, } } +EE resize_bilinear_x86_fp32_nchw( + TensorDesc inputDesc, F32 *input, ResizeParamSpec p, F32 *tmp, TensorDesc outputDesc, F32 *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + CHECK_REQUIREMENT(odf == DF_NCHW || idf == DF_NCHW); + + for (U32 c = 0; c < oc; ++c) { + F32 *outp = output + c * oh * ow; + F32 *inp = input + c * ih * iw; + + for (U32 h = 0; h < oh; ++h) { + F32 hC = infer_src(h, ih, oh, p.trans_mode); + hC = UNI_MIN(ih - 1, UNI_MAX(0, hC)); + I32 hT = floor(hC); + I32 hB = ceil(hC); + F32 h1 = hB - hC; + F32 h2 = hC - hT; + + for (U32 w = 0; w < ow; ++w) { + F32 wC = infer_src(w, iw, ow, p.trans_mode); + wC = UNI_MIN(iw - 1, UNI_MAX(0, wC)); + I32 wL = floor(wC); + I32 wR = ceil(wC); + F32 w1 = wR - wC; + F32 w2 = wC - wL; + + U32 output_idx = h * ow + w; + if (hB == hT && wL == wR) { + outp[output_idx] = inp[hT * iw + wL]; + } else if (hB == hT) { + outp[output_idx] = w1 * inp[hT * iw + wL] + w2 * inp[hT * iw + wR]; + } else if (wL == wR) { + outp[output_idx] = h1 * inp[hT * iw + wL] + h2 * inp[hB * iw + wL]; + } else { + outp[output_idx] = h1 * w1 * inp[hT * iw + wL] + + h1 * w2 * inp[hT * iw + wR] + + h2 * w1 * inp[hB * iw + wL] + + h2 * w2 * inp[hB * iw + wR]; + } + + } + } + } + return SUCCESS; +} + EE resize_bilinear_x86_fp32( - TensorDesc inputDesc, F32 *input, TensorDesc outputDesc, F32 *tmp, F32 *output, ResizeParamSpec p) + TensorDesc inputDesc, F32 *input, ResizeParamSpec p, F32 *tmp, TensorDesc outputDesc, F32 *output) { DataType idt, odt; DataFormat idf, odf; @@ -291,6 +346,7 @@ EE resize_bilinear_x86_fp32( CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); CHECK_REQUIREMENT(idf == DF_NCHWC8 || idf == DF_NCHW); + EE ret = SUCCESS; U32 ocStep = oh * ow * 8; @@ -316,22 +372,6 @@ EE resize_bilinear_x86_fp32( F32 hC = infer_src(h, ih, oh, p.trans_mode); F32 wC = infer_src(w, iw, ow, p.trans_mode); U32 output_idx = h * ow * 8 + w * 8; - if (h == 0 && w == 0) { - copy[func_idx](input, output, icStep, ocStep, ic, inStep, onStep, on); - continue; - } else if (h == oh - 1 && w == ow - 1) { - copy[func_idx](input + ((ih - 1) * iw + iw - 1) * itile_size, 
output + output_idx, - icStep, ocStep, ic, inStep, onStep, on); - continue; - } else if (h == 0 && w == ow - 1) { - copy[func_idx](input + (iw - 1) * itile_size, output + output_idx, icStep, ocStep, - ic, inStep, onStep, on); - continue; - } else if (h == oh - 1 && w == 0) { - copy[func_idx](input + (ih - 1) * iw * itile_size, output + output_idx, icStep, - ocStep, ic, inStep, onStep, on); - continue; - } // process edge pixel, linear hC = UNI_MIN(ih - 1, UNI_MAX(0, hC)); @@ -390,17 +430,10 @@ EE resize_bilinear_x86_fp32( } } I32 mainc = c; - for (; c < (I32)oc - 3; c += 4) { - for (I32 hw = 0; hw < ohow; ++hw) { - outArray[n * oc * ohow + c * ohow + hw] = - output[n * oc * ohow + mainc * ohow + hw * 4 + (c - mainc)]; - } - } - mainc = c; for (; c < (I32)oc; ++c) { for (I32 hw = 0; hw < ohow; ++hw) { outArray[n * oc * ohow + c * ohow + hw] = - output[n * oc * ohow + mainc * ohow + hw * ((I32)oc - mainc) + (c - mainc)]; + output[n * oc * ohow + mainc * ohow + hw * 8 + (c - mainc)]; } } } @@ -411,10 +444,10 @@ EE resize_bilinear_x86_fp32( EE resize_bilinear_x86(TensorDesc inputDesc, void *input, - TensorDesc outputDesc, + ResizeParamSpec p, void *tmp, - void *output, - ResizeParamSpec p) + TensorDesc outputDesc, + void *output) { DataType idt, odt; DataFormat idf, odf; @@ -425,8 +458,13 @@ EE resize_bilinear_x86(TensorDesc inputDesc, EE ret = NOT_SUPPORTED; switch (idt) { case DT_F32: - ret = resize_bilinear_x86_fp32( - inputDesc, (F32 *)input, outputDesc, (F32 *)tmp, (F32 *)output, p); + if (idf == DF_NCHW && odf == DF_NCHW) { + ret = resize_bilinear_x86_fp32_nchw( + inputDesc, (F32 *)input, p, (F32 *)tmp, outputDesc, (F32 *)output); + } else { + ret = resize_bilinear_x86_fp32( + inputDesc, (F32 *)input, p, (F32 *)tmp, outputDesc, (F32 *)output); + } default: break; } diff --git a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h index 801ddc61..f3c14b3f 100644 --- a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h +++ b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h @@ -15,22 +15,22 @@ inline EE set_resize_nearest_opt_mali(ResizeParamSpec p, CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); std::string modeName = ""; switch (p.trans_mode) { - case HALF_PIXEL: { + case COORDINATE_TRANS_HALF_PIXEL: { modeName = "_half_pixel"; CHECK_STATUS(set_chars_define_opt("USE_HALF_PIXEL", opt)); break; } - case PYTORCH_HALF_PIXEL: { + case COORDINATE_TRANS_PYTORCH_HALF_PIXEL: { modeName = "_pytorch_half_pixel"; CHECK_STATUS(set_chars_define_opt("USE_PYTORCH_HALF_PIXEL", opt)); break; } - case ALIGN_CORNERS: { + case COORDINATE_TRANS_ALIGN_CORNERS: { modeName = "_align_corners"; CHECK_STATUS(set_chars_define_opt("USE_ALIGN_CORNERS", opt)); break; } - case ASYMMETRIC: { + case COORDINATE_TRANS_ASYMMETRIC: { modeName = "_asymmetric"; CHECK_STATUS(set_chars_define_opt("USE_ASYMMETRIC", opt)); break; @@ -42,8 +42,9 @@ inline EE set_resize_nearest_opt_mali(ResizeParamSpec p, if (useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "resize_nearest_%s%s%s", ioMemName, formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "resize_nearest"); + std::string kernel = std::string("resize_nearest_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "resize_nearest"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } @@ -66,8 +67,9 @@ inline EE set_resize_bilinear_opt_mali(bool useNchwFormat, if 
(useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "resize_bilinear_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "resize_bilinear"); + std::string kernel = std::string("resize_bilinear_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "resize_bilinear"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp index 9ba37323..91301879 100644 --- a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp +++ b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp @@ -97,7 +97,7 @@ inline EE resize_nearest_core_mali_fp16(GCLHandle_t handle, GCLMemType outputMemType = output->desc.memType; F32 ratiow, ratioh; - if (p.trans_mode == ALIGN_CORNERS) { + if (p.trans_mode == COORDINATE_TRANS_ALIGN_CORNERS) { ratiow = (iw - 1.0) / (ow - 1.0); ratioh = (ih - 1.0) / (oh - 1.0); } else { diff --git a/compute/image/src/grid_sample.cpp b/compute/image/src/grid_sample.cpp new file mode 100644 index 00000000..36a49754 --- /dev/null +++ b/compute/image/src/grid_sample.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
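The new NCHW bilinear path in resize_bilinear_x86_fp32_nchw and the RESIZE_LINEAR branch of grid_sample both reduce to the same 2x2 blend: each corner is weighted by its distance to the opposite corner. A pure-function sketch over a single-channel row-major image (weights assumed equivalent to the patch's h1/h2/w1/w2 form):

#include <cmath>

// hC/wC are assumed already clamped to [0, ih - 1] / [0, iw - 1], as in the patch.
static float bilinear_sketch(const float *img, int ih, int iw, float hC, float wC)
{
    int hT = (int)floorf(hC), wL = (int)floorf(wC);  // top-left corner
    int hB = (hT + 1 < ih) ? hT + 1 : hT;            // clamp the bottom row at the border
    int wR = (wL + 1 < iw) ? wL + 1 : wL;            // clamp the right column at the border
    float h2 = hC - hT, h1 = 1.0f - h2;              // vertical weights
    float w2 = wC - wL, w1 = 1.0f - w2;              // horizontal weights
    return h1 * (w1 * img[hT * iw + wL] + w2 * img[hT * iw + wR]) +
           h2 * (w1 * img[hB * iw + wL] + w2 * img[hB * iw + wR]);
}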
+ +#include "image.h" +#ifdef _USE_CPU +#include "cpu/image_cpu.h" +#endif + +EE grid_sample_infer_output_size( + Tensor *inputTensor, Tensor *gridTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc gridDesc = gridTensor->get_desc(); + TensorDesc outputDesc = outputTensor->get_desc(); + auto arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = grid_sample_infer_output_size_cpu(inputDesc, gridDesc, &outputDesc); + } + outputTensor->resize(outputDesc); + return ret; +} + +EE grid_sample_infer_forward_tmp_bytes(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor outputTensor, + U32 *bytes, + ArchInfo_t archInfo) +{ + *bytes = 0; + return SUCCESS; +} + +EE grid_sample(Tensor inputTensor, + Tensor gridTensor, + GridSampleParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc gridDesc = gridTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *grid = get_ptr_from_tensor(gridTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = grid_sample_cpu(inputDesc, input, gridDesc, grid, p, tmp, outputDesc, output); + } + return ret; +} diff --git a/compute/image/src/image_processing.cpp b/compute/image/src/image_processing.cpp index 3b5a754a..6b1c6fb1 100644 --- a/compute/image/src/image_processing.cpp +++ b/compute/image/src/image_processing.cpp @@ -11,6 +11,7 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
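The new grid_sample front end above follows the library's usual infer-size / infer-tmp-bytes / compute sequence. A rough CPU-side usage sketch (tensor shapes and the GridSampleParamSpec contents are illustrative placeholders, not taken from this patch):

    ArchInfo archInfo;
    archInfo.arch = CPU_GENERAL;
    Tensor inputTensor = Tensor::alloc_sized(tensor4df(DT_F32, DF_NCHW, 1, 3, 32, 32));
    Tensor gridTensor = Tensor::alloc_sized(tensor4df(DT_F32, DF_NCHW, 1, 16, 16, 2));
    Tensor outputTensor;
    GridSampleParamSpec p;  // field values are assumptions; set mode/padding as your model requires
    CHECK_STATUS(grid_sample_infer_output_size(&inputTensor, &gridTensor, &outputTensor, &archInfo));
    outputTensor.alloc();
    U32 tmpBytes = 0;
    CHECK_STATUS(grid_sample_infer_forward_tmp_bytes(inputTensor, gridTensor, p, outputTensor, &tmpBytes, &archInfo));
    Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, tmpBytes));
    CHECK_STATUS(grid_sample(inputTensor, gridTensor, p, tmpTensor, outputTensor, &archInfo));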
+#include #include "image.h" template @@ -87,7 +88,10 @@ std::shared_ptr get_resize_image( } ResizeParamSpec p; - p.mode = LINEAR; + p.mode = RESIZE_LINEAR; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.round_mode = ROUND_FLOOR; + // consider the dataformat if (targetImageFormat == RGB_SC) { // Specific for Birealnet18, scale short edge to 224 first F32 scale = 224.0 / UNI_MIN(height, width); @@ -98,11 +102,9 @@ std::shared_ptr get_resize_image( height = (U32)(scale * height + 0.5); width = 224; } - Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - scaleTensor.resize(scaledDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(scaledDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); @@ -113,14 +115,14 @@ std::shared_ptr get_resize_image( for (U32 w = w0; w < w0 + imageWidth; w++) { T value = (scaled[c * height * width + h * width + w] / 255 - meanRGBSC[c]) / stdRGBSC[c]; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); + CHECK_REQUIREMENT(!isnan((float)value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } } } } else if (targetImageFormat == RGB_RAW) { - resize(rgbTensor, temp, *transferSpaceTensor.get(), p, &archInfo); + resize(rgbTensor, p, temp, *transferSpaceTensor.get(), &archInfo); } else if (targetImageFormat == RGB_SC_RAW || targetImageFormat == BGR_SC_RAW) { F32 scale = 256.0 / UNI_MIN(height, width); if (height < width) { @@ -130,11 +132,9 @@ std::shared_ptr get_resize_image( height = (U32)(scale * (F32)height + 0.5); width = 256; } - Tensor scaleTensor; TensorDesc scaledDesc = tensor4df(imageDt, imageDf, imageNum, imageChannel, height, width); - scaleTensor.resize(scaledDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(scaledDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); U32 h0 = (U32)((height - 224) * 0.5); U32 w0 = (U32)((width - 224) * 0.5); @@ -142,16 +142,14 @@ std::shared_ptr get_resize_image( T *scaled = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { for (U32 h = h0; h < h0 + 224; h++) { - memcpy(transferSpacePtrMov, scaled + c * height * width + h * width + w0, + UNI_MEMCPY(transferSpacePtrMov, scaled + c * height * width + h * width + w0, 224 * bytesOf(imageDt)); transferSpacePtrMov += 224; } } } else { - Tensor scaleTensor; - scaleTensor.resize(imageDesc); - scaleTensor.alloc(); - resize(rgbTensor, temp, scaleTensor, p, &archInfo); + Tensor scaleTensor = Tensor::alloc_sized(imageDesc); + resize(rgbTensor, p, temp, scaleTensor, &archInfo); T *resized = (T *)get_ptr_from_tensor(scaleTensor, arch); for (U32 c : transform) { @@ -160,7 +158,7 @@ std::shared_ptr get_resize_image( T value = (resized[c * imageHeight * imageWidth + h * imageWidth + w] - 1.0 * meanRGB[c]) * scaleValue; - CHECK_REQUIREMENT(!UNI_ISNAN(value)); + CHECK_REQUIREMENT(!isnan((float)value)); *transferSpacePtrMov = value; transferSpacePtrMov++; } diff --git a/compute/image/src/resize.cpp b/compute/image/src/resize.cpp index 09591837..0589a895 100644 --- a/compute/image/src/resize.cpp +++ b/compute/image/src/resize.cpp @@ -27,38 +27,32 @@ #ifdef _USE_X86 #include "cpu/x86/image_x86.h" #endif -#include // params is a pointer to either the target size or the resize ratios // When paramDT specifies DT_U32, params should point to target sizes (height and width) // When 
paramDT specifies DT_F32, params should point to resize ratios -EE resize_infer_output_size_cpu( - TensorDesc inputDesc, DataType paramDT, void *params, TensorDesc *outputDesc, U32 *outputBytes) +EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorDesc *outputDesc) { - if (nullptr == outputDesc || nullptr == outputBytes) { - CHECK_STATUS(NULL_POINTER); - } DataType idt; DataFormat idf, odf; - U32 in, ic, ih, iw; - U32 oh, ow; - CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - - switch (paramDT) { - case DT_F32: { - F32 *scales = (F32 *)params; - oh = ih * scales[0]; - ow = iw * scales[1]; - break; - } - case DT_U32: { - U32 *len = (U32 *)params; - oh = len[0]; - ow = len[1]; - break; + U32 in, ic, ih, iw = 1; + U32 oh, ow = 1; + if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + } else if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + } else { + UNI_ERROR_LOG("can support to resize %d-dim tensor.\n", inputDesc.nDims); + } + if (p.num_sizes > 0) { + oh = p.sizes[0]; + if (p.num_sizes > 1) { + ow = p.sizes[1]; } - default: { - return NOT_SUPPORTED; + } else { + oh = ih * p.scales[2]; + if (p.num_scales > 3) { + ow = iw * p.scales[3]; } } if (ic % 8 == 0) { @@ -66,28 +60,23 @@ EE resize_infer_output_size_cpu( } else { odf = idf; } - *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); - *outputBytes = tensorNumBytes(*outputDesc); + if (tensorIs3d(inputDesc)) { + *outputDesc = tensor3df(idt, odf, in, ic, oh); + } else if (tensorIs4d(inputDesc)) { + *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); + } return SUCCESS; } -EE resize_infer_output_size(Tensor *inputTensor, - DataType paramDT, - void *params, - Tensor *outputTensor, - U32 *outputBytes, - ArchInfo_t archInfo) +EE resize_infer_output_size( + Tensor *inputTensor, ResizeParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - EE ret = NOT_SUPPORTED; - ret = resize_infer_output_size_cpu(inputDesc, paramDT, params, &outputDesc, outputBytes); + EE ret = resize_infer_output_size_cpu(inputDesc, p, &outputDesc); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU outputDesc.df = inputDesc.df; @@ -97,23 +86,49 @@ EE resize_infer_output_size(Tensor *inputTensor, return ret; } +EE resize_infer_forward_tmp_bytes( + Tensor inputTensor, ResizeParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) +{ + if (bytes == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); + *bytes = 0; + auto arch = archInfo->arch; + if (IS_GPU(arch)) { + if (inputDesc.df == DF_NCHW && inputTensor.get_mem_type() != OCLMem) { + *bytes = tensorNumBytes(inputDesc); + } + } else { + if (DF_NCHW == inputDesc.df && (IS_ARM(arch) || IS_X86(arch))) { + int channelAxis = inputDesc.nDims - 2; + U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; + inputDesc.dims[channelAxis] = paddedC; + outputDesc.dims[channelAxis] = paddedC; + *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + } + } + return SUCCESS; +} + EE resize_bilinear(TensorDesc inputDesc, void *input, + ResizeParamSpec p, + void *tmp, TensorDesc outputDesc, void *output, - 
void *tmp, - ResizeParamSpec p, ArchInfo_t archInfo) { auto arch = archInfo->arch; EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = resize_bilinear_general(inputDesc, input, outputDesc, output); + ret = resize_bilinear_general(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = resize_bilinear_x86(inputDesc, input, outputDesc, tmp, output, p); + ret = resize_bilinear_x86(inputDesc, input, p, tmp, outputDesc, output); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -131,7 +146,7 @@ EE resize_bilinear(TensorDesc inputDesc, outputARM = inputARM + tensorNumBytes(inDescARM); transformNCHWToNCHWC8(inputDesc, input, inDescARM, inputARM); } - ret = resize_bilinear_arm(inDescARM, inputARM, outDescARM, outputARM); + ret = resize_bilinear_arm(inDescARM, inputARM, p, outDescARM, outputARM); if (DF_NCHWC8 != outputDesc.df) { transformToNCHW(outDescARM, outputARM, outputDesc, output); } @@ -142,16 +157,15 @@ EE resize_bilinear(TensorDesc inputDesc, (GCLMem_t)input, outputDesc, (GCLMem_t)tmp, (GCLMem_t)output); #endif } - CHECK_STATUS(ret); return ret; } EE resize_nearest(TensorDesc inputDesc, void *input, + ResizeParamSpec p, + void *tmp, TensorDesc outputDesc, void *output, - void *tmp, - ResizeParamSpec p, ArchInfo_t archInfo) { auto arch = archInfo->arch; @@ -170,7 +184,7 @@ EE resize_nearest(TensorDesc inputDesc, } EE resize( - Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ResizeParamSpec p, ArchInfo_t archInfo) + Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); @@ -179,30 +193,36 @@ EE resize( void *output = get_ptr_from_tensor(outputTensor, arch); void *tmp = get_ptr_from_tensor(tmpTensor, arch); + if (inputDesc.nDims == 3) { + for (int i = inputDesc.nDims; i > 0; i--) { + inputDesc.dims[i] = inputDesc.dims[i - 1]; + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + inputDesc.nDims++; + outputDesc.nDims++; + } DataType idt, odt; DataFormat idf, odf; U32 in, ic, ih, iw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - CHECK_REQUIREMENT(in == on && ic == oc); if (ih == oh && iw == ow && IS_CPU(arch)) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); return SUCCESS; } - EE ret; + EE ret = NOT_SUPPORTED; switch (p.mode) { - case NEAREST: - ret = resize_nearest(inputDesc, input, outputDesc, output, tmp, p, archInfo); + case RESIZE_NEAREST: + ret = resize_nearest(inputDesc, input, p, tmp, outputDesc, output, archInfo); break; - case LINEAR: - ret = resize_bilinear(inputDesc, input, outputDesc, output, tmp, p, archInfo); + case RESIZE_LINEAR: + ret = resize_bilinear(inputDesc, input, p, tmp, outputDesc, output, archInfo); break; default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/image/tests/test_image_processing.cpp b/compute/image/tests/test_image_processing.cpp index 427ed21e..36232a8f 100644 --- a/compute/image/tests/test_image_processing.cpp +++ b/compute/image/tests/test_image_processing.cpp @@ -20,7 +20,7 @@ int main() TensorDesc rgbDesc = tensor4df(DT_U8, DF_RGB, 1, 3, 1280, 960); U8 *rgb = ut_input_v(tensorNumElements(rgbDesc), DT_U8, UT_INIT_POS); Tensor rgbTensor = Tensor::alloc_sized(rgbDesc); - memcpy(get_ptr_from_tensor(rgbTensor, ARM_A76), rgb, tensorNumBytes(rgbDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(rgbTensor, ARM_A76), rgb, tensorNumBytes(rgbDesc)); TensorDesc imageDesc = tensor4df(DT_F32, DF_NCHW, 1, 3, 224, 224); load_resize_image(rgbTensor, imageDesc, RGB, 0.017); diff --git a/compute/image/tests/test_image_resize.cpp b/compute/image/tests/test_image_resize.cpp index 369590fc..02e4e030 100644 --- a/compute/image/tests/test_image_resize.cpp +++ b/compute/image/tests/test_image_resize.cpp @@ -27,46 +27,46 @@ int resizeTest(int argc, char *argv[], DataType dt) U32 oc = atoi(argv[6]); U32 oh = atoi(argv[7]); U32 ow = atoi(argv[8]); - ArchInfo archInfo; - archInfo.arch = UT_ARCH; - ArchInfo archInfo_org; - archInfo_org.arch = CPU_GENERAL; - CHECK_REQUIREMENT(in == 1 && on == 1); - TensorDesc inputDesc, outputDesc; - inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); + ArchInfo archInfo; + archInfo.arch = UT_ARCH; - DataType paramDT = DT_F32; - F32 scales[2]; - scales[0] = (F32)oh / (F32)ih; - scales[1] = (F32)ow / (F32)iw; + ResizeParamSpec p; + p.mode = RESIZE_LINEAR; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.num_sizes = 0; + p.num_scales = 4; + p.scales[0] = oh; + p.scales[1] = ow; + p.scales[2] = (F32)oh / (F32)ih; + p.scales[3] = (F32)ow / (F32)iw; // setup input + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); - Tensor inputTensor; - inputTensor.resize(inputDesc); - inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); + Tensor inputTensor = Tensor::alloc_sized(inputDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, UT_ARCH), input, tensorNumBytes(inputDesc)); // setup output - U32 outputBytes; Tensor outputTensor; - CHECK_STATUS(resize_infer_output_size( - &inputTensor, paramDT, scales, &outputTensor, &outputBytes, &archInfo)); - outputDesc = outputTensor.get_desc(); + CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); + TensorDesc outputDesc = outputTensor.get_desc(); CHECK_REQUIREMENT(tensorNumElements(outputDesc) == on * oc * oh * ow); outputTensor.alloc(); Tensor outputTensorRef = Tensor::alloc_sized(outputDesc); - Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_U8, 8 * tensorNumBytes(inputDesc))); - - ResizeParamSpec p; - p.mode = LINEAR; + U32 cpuTmpBytes = 0, cpuTmpBytesSerial = 0; + CHECK_STATUS( + resize_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &cpuTmpBytes, &archInfo)); + CHECK_STATUS(resize_infer_forward_tmp_bytes( + inputTensor, p, outputTensorRef, &cpuTmpBytesSerial, &UT_SERIAL_ARCHINFO)); + Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytes)); + Tensor tmpTensorSerial = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytesSerial)); if (UT_CHECK) { - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); // naive implement - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensorRef, p, &archInfo_org)); + CHECK_STATUS(resize(inputTensor, p, tmpTensorSerial, outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, UT_ARCH), @@ -77,7 +77,7 @@ int resizeTest(int argc, char *argv[], DataType dt) // benchmark double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / 
UT_LOOPS; diff --git a/compute/image/tests/test_image_resize_ocl.cpp b/compute/image/tests/test_image_resize_ocl.cpp index 564418ec..95fe27fe 100644 --- a/compute/image/tests/test_image_resize_ocl.cpp +++ b/compute/image/tests/test_image_resize_ocl.cpp @@ -31,72 +31,64 @@ int resizeTest(int argc, char *argv[], DataType dt) CHECK_REQUIREMENT(in == 1 && on == 1); - ArchInfo archInfo; - archInfo.arch = MALI; - ArchInfo archInfo_org; - archInfo_org.arch = CPU_GENERAL; - - TensorDesc inputDesc_cpu, inputDesc_gpu, outputDesc_cpu, outputDesc_gpu; - inputDesc_cpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - inputDesc_gpu = tensor4df(dt, DF_NCHW, in, ic, ih, iw); - - DataType paramDT = DT_U32; - U32 scales[2]; - scales[0] = oh; - scales[1] = ow; + ResizeParamSpec p; + //p.mode = RESIZE_LINEAR; + p.mode = RESIZE_NEAREST; + p.trans_mode = COORDINATE_TRANS_ASYMMETRIC; + p.num_scales = 0; + p.num_sizes = 2; + p.sizes[0] = oh; + p.sizes[1] = ow; // setup input + TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); U8 *input_cpu = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); - - Tensor inputTensorCpu; - inputTensorCpu.resize(inputDesc_cpu); - inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc_cpu)); + Tensor inputTensorCpu = Tensor::alloc_sized(inputDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, UT_ARCH), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; - Tensor tmpTensorCpu; - U32 outputBytes; - CHECK_STATUS(resize_infer_output_size( - &inputTensorCpu, paramDT, scales, &outputTensorCpu, &outputBytes, &archInfo_org)); + CHECK_STATUS( + resize_infer_output_size(&inputTensorCpu, p, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); + U32 cpuTmpBytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + inputTensorCpu, p, outputTensorCpu, &cpuTmpBytes, &UT_SERIAL_ARCHINFO)); + Tensor tmpTensorCpu = Tensor::alloc_sized(tensor1d(DT_I8, cpuTmpBytes)); - ResizeParamSpec p; - //p.mode = LINEAR; - p.mode = NEAREST; - p.trans_mode = ASYMMETRIC; // CPU output - CHECK_STATUS(resize(inputTensorCpu, tmpTensorCpu, outputTensorCpu, p, &archInfo_org)); - std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; + CHECK_STATUS(resize(inputTensorCpu, p, tmpTensorCpu, outputTensorCpu, &UT_SERIAL_ARCHINFO)); + ArchInfo archInfo; + archInfo.arch = MALI; + std::shared_ptr handleSharedPtr = OCLContext::getInstance().handle; GCLHandle_t handle = handleSharedPtr.get(); std::vector kernelVec; handle->kernelVec = &kernelVec; - Tensor inputTensor = Tensor(OCLMem); - Tensor outputTensor = Tensor(OCLMem); - Tensor tmpTensor = Tensor(OCLMem); - inputTensor.resize(inputDesc_gpu); - MaliPara maliPara; maliPara.handle = handle; archInfo.archPara = &maliPara; - CHECK_STATUS(resize_infer_output_size( - &inputTensor, paramDT, scales, &outputTensor, &outputBytes, &archInfo)); + Tensor inputTensor = Tensor(OCLMem); + Tensor outputTensor = Tensor(OCLMem); + Tensor tmpTensor = Tensor(OCLMem); + inputTensor.resize(inputDesc); + + CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); U32 maxBytes = 0; U32 tmpBytes = 0; GCLMem_t output = alloc(outputTensor); GCLMem_t input = alloc(inputTensor); CHECK_STATUS(gcl_fill_memory_zero(handle, input)); - outputDesc_gpu = outputTensor.get_desc(); + TensorDesc outputDesc_gpu = outputTensor.get_desc(); U8 *output_gpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); - tmpBytes = tensorNumBytes(inputDesc_gpu); + tmpBytes = tensorNumBytes(inputDesc); 
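The resize entry points above now take the ResizeParamSpec and the temporary buffer ahead of the output, and the target shape travels inside the param spec instead of a separate DataType/scales pair. Condensed from the updated tests, the new calling sequence looks roughly like this (oh/ow stand for the desired output height and width):

    ResizeParamSpec p;
    p.mode = RESIZE_LINEAR;
    p.trans_mode = COORDINATE_TRANS_ASYMMETRIC;
    p.num_scales = 0;
    p.num_sizes = 2;
    p.sizes[0] = oh;  // target height
    p.sizes[1] = ow;  // target width
    Tensor outputTensor;
    CHECK_STATUS(resize_infer_output_size(&inputTensor, p, &outputTensor, &archInfo));
    outputTensor.alloc();
    U32 tmpBytes = 0;
    CHECK_STATUS(resize_infer_forward_tmp_bytes(inputTensor, p, outputTensor, &tmpBytes, &archInfo));
    Tensor tmpTensor = Tensor::alloc_sized(tensor1d(DT_I8, tmpBytes));
    CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo));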
maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; tmpBytes = tensorNumBytes(outputDesc_gpu); maxBytes = (tmpBytes > maxBytes) ? tmpBytes : maxBytes; GCLMem_t tmpbuf = alloc_bytes(tmpTensor, maxBytes); - CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); + CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); - CHECK_STATUS(resize(inputTensor, tmpTensor, outputTensor, p, &archInfo)); + CHECK_STATUS(resize(inputTensor, p, tmpTensor, outputTensor, &archInfo)); /*warp up*/ UNI_INFO_LOG("warm up gpu:\n") for (U32 i = 0; i < 2; i++) { diff --git a/compute/tensor/include/feature.h b/compute/tensor/include/feature.h new file mode 100644 index 00000000..15cba329 --- /dev/null +++ b/compute/tensor/include/feature.h @@ -0,0 +1,254 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
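The new compute/tensor/include/feature.h that starts here adds TensorFlow-style feature-column helpers (vocabulary lookup, hash bucketing, bucketization, feature crossing, and embedding combination) built on farmhash fingerprints. As a rough usage sketch for the hash-bucket column, with the element types assumed from how tf_hash is applied to the inputs (the template arguments are not legible in this extract):

    std::vector<std::string> cities = {"shanghai", "shenzhen", "shanghai"};
    // identical strings always land in the same bucket id within [0, hash_bucket_size)
    std::vector<TYPE> ids = categorical_column_with_hash_bucket(cities, 1000);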
+ +#ifndef _H_FEATURE +#define _H_FEATURE + +#include +#include +#include + +#include "farmhash.h" + +#include "tensor_desc.h" + +typedef enum Combiner { + Combiner_Mean, + Combiner_Sum, +} Combiner; +typedef int TYPE; + +inline uint64_t shift_mix(const uint64_t val) +{ + return val ^ (val >> 47); +} + +inline uint64_t FingerprintCat64(const uint64_t &fp1, const uint64_t &fp2) +{ + static const uint64_t kMul = 0xc6a4a7935bd1e995ULL; + uint64_t result = fp1 ^ kMul; + result ^= shift_mix(fp2 * kMul) * kMul; + result *= kMul; + result = shift_mix(result) * kMul; + result = shift_mix(result); + return result; +} + +static inline uint64_t tf_hash(const std::string &s) +{ + return ::util::Fingerprint64(s.data(), s.size()); +} + +static inline uint64_t tf_hash64(const uint64_t &s, const uint64_t &key) +{ + return FingerprintCat64(key, s); +} + +static inline uint64_t tf_hash64(const std::string &s, const uint64_t &key) +{ + return FingerprintCat64(key, tf_hash(s)); +} + +std::vector categorical_column_with_vocabulary_list(const std::vector &input, + std::map &vocab, + int default_value = -1, + int num_oov_buckets = 0) +{ + int vocab_size = vocab.size(); + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + if (vocab.find(input[i]) != vocab.end()) { + ret[i] = vocab[input[i]]; + } else { + if (num_oov_buckets > 0) { + ret[i] = tf_hash(input[i]) % num_oov_buckets + vocab_size; + } else { + ret[i] = default_value; + } + } + } + return ret; +} + +std::vector categorical_column_with_hash_bucket( + const std::vector &input, int hash_bucket_size) +{ + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = tf_hash(input[i]) % hash_bucket_size; + } + return ret; +} + +std::vector categorical_column_with_identity( + const std::vector &input, int bucket_size, int default_value = 0) +{ + std::vector ret(input.size()); + for (uint32_t i = 0; i < input.size(); i++) { + if (input[i] < bucket_size) { + ret[i] = input[i]; + } else { + ret[i] = default_value; + } + } + return ret; +} + +inline uint32_t quick_search( + const std::vector &data, const TYPE &query, const uint32_t &left, const uint32_t &right) +{ +#if 1 + for (int j = left; j < right; j++) { + if (query < data[j]) { + return j; + } + } +#else + if (left >= right) { + return left; + } + int mid = (left + right) / 2; + if (query < data[mid]) { + return quick_search(data, query, left, mid); + } else { + return quick_search(data, query, mid, right); + } +#endif +} + +std::vector bucketized_column( + const std::vector &input, const std::vector &boundaries) +{ + std::vector ret(input.size()); + uint32_t size = boundaries.size(); + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = quick_search(boundaries, input[i], 0, size); + ; + } + return ret; +} + +void indicator_column(const TensorDesc &input_desc, + const TYPE *input, + int categorical_num, + TensorDesc *output_desc, + TYPE *output, + const TYPE *weight = nullptr) +{ + *output_desc = input_desc; + output_desc->dims[0] = categorical_num; + uint32_t count = 1; + for (uint32_t i = 1; i < input_desc.nDims; i++) { + count *= input_desc.dims[i]; + } + + memset(output, 0, count * categorical_num * sizeof(TYPE)); + if (weight != nullptr) { + for (uint32_t i = 0, j = 0, n = 0; i < count; i++, j += categorical_num) { + for (uint32_t k = 0; k < input_desc.dims[0]; k++, n++) { + output[j + input[n]] += weight[n]; + } + } + } else { + for (uint32_t i = 0, j = 0, n = 0; i < count; i++, j += categorical_num) { + for (uint32_t k = 0; k < 
input_desc.dims[0]; k++, n++) { + output[j + input[n]]++; + } + } + } +} + +template +std::vector crossed_column(const std::vector &input0, + const std::vector &input1, + int hash_bucket_size, + const uint64_t hash_key = 0xDECAFCAFFE) +{ + std::vector ret(input0.size()); + for (uint32_t i = 0; i < input0.size(); i++) { + ret[i] = tf_hash64(input1[i], tf_hash64(input0[i], hash_key)) % hash_bucket_size; + } + return ret; +} + +template +std::vector numeric_column(const std::vector &input, + F const &normalizer_fn = nullptr, + int shape = 0, + TO default_value = -1) +{ + if (shape > 0) { + return std::vector(shape, default_value); + } + std::vector ret = std::vector(input.size()); + if (normalizer_fn == nullptr) { + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = input[i]; + } + } else { + for (uint32_t i = 0; i < input.size(); i++) { + ret[i] = normalizer_fn(input[i]); + } + } + return ret; +} + +template +inline void embedding_combine(const std::vector &input, const uint32_t &dimension, T *output) +{ + if (input.size() == 0) { + memset(output, 0, sizeof(T) * dimension); + return; + } + if (combiner == Combiner_Mean || combiner == Combiner_Sum) { + memcpy(output, input[0], sizeof(T) * dimension); + for (uint32_t i = 1; i < input.size(); i++) { + for (uint32_t j = 0; j < dimension; j++) { + output[j] += input[i][j]; + } + } + if (combiner == Combiner_Mean) { + for (uint32_t j = 0; j < dimension; j++) { + output[j] /= input.size(); + } + } + } else { + printf("[ERROR] currently not support combine function %d.\n", combiner); + exit(1); + } +} + +template +void embedding_column(const TensorDesc &input_desc, + const TYPE *input, + const T *vocab, + const uint32_t &dimension, + TensorDesc *output_desc, + T *output) +{ + *output_desc = input_desc; + output_desc->dims[0] = dimension; + uint32_t count = 1; + for (uint32_t i = 1; i < input_desc.nDims; i++) { + count *= input_desc.dims[i]; + } + + std::vector vec(input_desc.dims[0]); + for (uint32_t i = 0, j = 0; i < count; i++, output += dimension) { + for (uint32_t k = 0; k < input_desc.dims[0]; k++, j++) { + vec[k] = vocab + input[j] * dimension; + } + embedding_combine(vec, dimension, output); + } +} +#endif diff --git a/compute/tensor/include/tensor_computing.h b/compute/tensor/include/tensor_computing.h index 6b7992ca..2a562c14 100644 --- a/compute/tensor/include/tensor_computing.h +++ b/compute/tensor/include/tensor_computing.h @@ -172,6 +172,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, Tensor pwFilterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor dwBiasTensor, Tensor pwBiasTensor, std::vector tmpTensors, @@ -221,6 +222,7 @@ EE depthwise_convolution(Tensor inputTensor, Tensor filterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor biasTensor, Tensor tmpTensor, Tensor outputTensor, @@ -272,7 +274,8 @@ EE activation( EE concat_infer_output_size( std::vector inputTensor, ConcatParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); -EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, ArchInfo_t archInfo); +EE concat_infer_forward_tmp_bytes( + std::vector inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo); EE concat(std::vector inputTensor, ConcatParamSpec p, @@ -320,14 +323,20 @@ EE fully_connected(Tensor inputTensor, EE softmax_infer_output_size( Tensor *inputTensor, SoftmaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); +EE 
softmax_infer_forward_tmp_bytes( + Tensor inputTensor, SoftmaxParamSpec p, U32 *bytes, ArchInfo_t archInfo); + EE softmax(Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE softmax_infer_forward_tmp_bytes( - Tensor inputTensor, SoftmaxParamSpec p, U32 *bytes, ArchInfo_t archInfo); +EE logsoftmax(Tensor inputTensor, + SoftmaxParamSpec p, + Tensor tmpTensor, + Tensor outputTensor, + ArchInfo_t archInfo); EE rnn_infer_output_size(std::vector inputTensor, RNNParamSpec rnnParamSpec, @@ -465,6 +474,7 @@ EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, Ar EE normalization_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInfo_t archInfo); EE layer_normalization(Tensor inputTensor, + LayerNormParamSpec p, Tensor alphaTensor, Tensor betaTensor, Tensor tmpTensor, @@ -554,7 +564,8 @@ EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor EE attention(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); +EE power_infer_output_size( + Tensor *inputTensor, PowerParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); @@ -689,9 +700,12 @@ EE yolov3detectionoutput(std::vector inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo); +EE preallocated_memory_infer_output_size(std::vector inputTensors, + PreAllocatedMemoryParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo); -EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo); +EE preallocated_memory(PreAllocatedMemoryParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); EE copy_infer_output_size(std::vector inputTensor, ArchInfo_t archInfo); @@ -795,26 +809,16 @@ EE tile(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); -EE where_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); +EE where_infer_output_size( + Tensor *xTensor, Tensor *yTensor, Tensor *outputTensor, ArchInfo_t archInfo); -EE where(Tensor inputTensor, - Tensor conditionTensor, - Tensor yTensor, - Tensor outputTensor, - ArchInfo_t archInfo); +EE where( + Tensor conditionTensor, Tensor xTensor, Tensor yTensor, Tensor outputTensor, ArchInfo_t archInfo); EE cast_infer_output_size( - Tensor *inputTensor, Tensor *outputTensor, CastParamSpec p, ArchInfo_t archInfo); - -EE cast(Tensor inputTensor, Tensor outputTensor, CastParamSpec p, ArchInfo_t archInfo); - -EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo); + Tensor *inputTensor, CastParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo); -EE equal(Tensor inputTensor, - Tensor compareTensor, - EqualParamSpec p, - Tensor outputTensor, - ArchInfo_t archInfo); +EE cast(Tensor inputTensor, CastParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); EE quantize(Tensor inputTensor, Tensor *outputTensor, F32 *scale, ArchInfo_t archInfo); @@ -930,4 +934,15 @@ EE generate_proposals(Tensor deltaTensor, std::vector tmpTensors, Tensor outputTensor, ArchInfo_t archInfo); + +EE onehot_infer_output_size( + Tensor *inputTensor, OneHotParamSpec p, DataType type, Tensor *outputTensor, ArchInfo_t archInfo); + +EE onehot(Tensor inputTensor, OneHotParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE cumsum_infer_output_size(Tensor *inputTensor, Tensor 
*outputTensor, ArchInfo_t archInfo); + +EE cumsum(Tensor inputTensor, CumSumParamSpec p, Tensor outputTensor, ArchInfo_t archInfo); + +EE non_zero(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo); #endif diff --git a/compute/tensor/src/CMakeLists.txt b/compute/tensor/src/CMakeLists.txt index bc1b111d..14e606b2 100644 --- a/compute/tensor/src/CMakeLists.txt +++ b/compute/tensor/src/CMakeLists.txt @@ -4,6 +4,7 @@ if (USE_GENERAL) endif (USE_GENERAL) if (USE_NEON) + file(GLOB arm_int32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int32/*.cpp) if (USE_FP32) file(GLOB arm_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/fp32/*.cpp) if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") @@ -20,18 +21,19 @@ if (USE_NEON) if (USE_INT8) file(GLOB arm_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/*.cpp) if (USE_FP16) - file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8/*.cpp) - else () + file(GLOB armv8_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v8.2/*.cpp) + elseif (NOT "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") file(GLOB armv7_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/int8/v7/*.cpp) endif () set(arm_int8_srcs "${arm_int8_srcs};${armv8_int8_srcs};${armv7_int8_srcs}") endif (USE_INT8) file(GLOB arm_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/arm/*.cpp) - set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs}") + set(arm_srcs "${arm_srcs};${arm_fp16_srcs};${arm_fp32_srcs};${arm_int8_srcs};${arm_bnn_srcs};${arm_int32_srcs}") file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_NEON) if (USE_X86) + file(GLOB x86_int32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/int32/*.cpp) if (USE_FP32) file(GLOB x86_fp32_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/fp32/*.cpp) endif (USE_FP32) @@ -39,7 +41,7 @@ if (USE_X86) file(GLOB x86_int8_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/int8/*.cpp) endif (USE_INT8) file(GLOB x86_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/x86/*.cpp) - set(x86_srcs "${x86_srcs};${x86_fp32_srcs};${x86_int8_srcs}") + set(x86_srcs "${x86_srcs};${x86_int32_srcs};${x86_fp32_srcs};${x86_int8_srcs}") file(GLOB cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/*.cpp) endif (USE_X86) @@ -58,6 +60,9 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) # shared library add_library(${PROJECT_NAME} SHARED ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC blas_enhance uni) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () # static library add_library(${PROJECT_NAME}_static STATIC ${srcs}) diff --git a/compute/tensor/src/attention.cpp b/compute/tensor/src/attention.cpp index 9f05d2c0..8541236e 100644 --- a/compute/tensor/src/attention.cpp +++ b/compute/tensor/src/attention.cpp @@ -61,8 +61,10 @@ EE attention_infer_output_size(Tensor *inputTensor, AttentionParamSpec p, Tensor DataFormat df; U32 batch, sequenceLength; CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &sequenceLength)); + U32 oh = UNI_MIN(p.from_sequence_length, sequenceLength); + U32 ow = UNI_MIN(p.to_sequence_length, sequenceLength); outputDesc = - tensor4df(dt, DF_NCHW, batch, p.num_heads, p.from_sequence_length, p.to_sequence_length); + tensor4df(dt, DF_NCHW, batch, p.num_heads, oh, ow); outputTensor->resize(outputDesc); return SUCCESS; } diff --git a/compute/tensor/src/cast.cpp b/compute/tensor/src/cast.cpp index 243a08b5..1e7fe5e3 100644 --- a/compute/tensor/src/cast.cpp +++ b/compute/tensor/src/cast.cpp @@ -10,24 +10,25 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + #include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif EE cast_infer_output_size( - Tensor *inputTensor, Tensor *outputTensor, CastParamSpec p, ArchInfo_t archInfo) + Tensor *inputTensor, CastParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); outputDesc = inputDesc; - outputDesc.dt = p.targetDt; + outputDesc.dt = p.dt; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU if (outputDesc.dt != DT_I32 && outputDesc.dt != DT_F16) { @@ -35,107 +36,29 @@ EE cast_infer_output_size( } #endif } - outputTensor->resize(outputDesc); - return SUCCESS; -} - -template -static EE diffSourceCastKernel(U32 len, TI *inputPtr, TO *outputPtr) -{ - for (U32 i = 0; i < len; ++i) { - outputPtr[i] = (TO)(inputPtr[i]); +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + outputDesc.dt = DT_U32; } - return SUCCESS; -} - -template -static EE diffSourceCast(TensorDesc inputDesc, T *inputPtr, void *outputPtr, CastParamSpec p) -{ - EE ret = SUCCESS; - U32 len = tensorNumElements(inputDesc); - switch (p.targetDt) { - case DT_I32: { - diffSourceCastKernel(len, inputPtr, (I32 *)outputPtr); - break; - } - case DT_U32: { - diffSourceCastKernel(len, inputPtr, (U32 *)outputPtr); - break; - } -#ifdef _USE_FP32 - case DT_F32: { - diffSourceCastKernel(len, inputPtr, (F32 *)outputPtr); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - diffSourceCastKernel(len, inputPtr, (F16 *)outputPtr); - break; - } #endif - case DT_U8: { - diffSourceCastKernel(len, inputPtr, (U8 *)outputPtr); - break; - } - case DT_I8: { - diffSourceCastKernel(len, inputPtr, (INT8 *)outputPtr); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; + outputTensor->resize(outputDesc); + return SUCCESS; } -EE cast(Tensor inputTensor, Tensor outputTensor, CastParamSpec p, ArchInfo_t archInfo) +EE cast(Tensor inputTensor, CastParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - switch (inputDesc.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = diffSourceCast(inputDesc, (F32 *)input, output, p); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = diffSourceCast(inputDesc, (F16 *)input, output, p); - break; - } -#endif - case DT_U32: { - ret = diffSourceCast(inputDesc, (U32 *)input, output, p); - break; - } - case DT_I32: { - ret = diffSourceCast(inputDesc, (I32 *)input, output, p); - break; - } - case DT_U8: { - ret = diffSourceCast(inputDesc, (U8 *)input, output, p); - break; - } - case DT_I8: { - ret = diffSourceCast(inputDesc, (INT8 *)input, output, p); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } + ret = cast_cpu(inputDesc, 
input, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { - TensorDesc outputDesc = outputTensor.get_desc(); ret = cast_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, outputDesc, (GCLMem_t)output); #endif diff --git a/compute/tensor/src/check.cpp b/compute/tensor/src/check.cpp index c61c7feb..0a3707f9 100644 --- a/compute/tensor/src/check.cpp +++ b/compute/tensor/src/check.cpp @@ -12,14 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_X86 -#include "cpu/x86/tensor_computing_x86.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" #endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" @@ -39,17 +33,9 @@ EE check(Tensor inputTensorA, TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { + if (IS_CPU(arch)) { #ifdef _USE_GENERAL - ret = check_general(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = check_x86(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = check_arm(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); + ret = check_cpu(inputDescA, inputA, inputDescB, inputB, p, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -63,27 +49,16 @@ EE check(Tensor inputTensorA, EE check_infer_output_size( std::vector inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - EE ret = NOT_SUPPORTED; if (outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } - for (auto p : inputTensor) { - if (p == nullptr) { - CHECK_STATUS(NULL_POINTER); - } + TensorDesc outputDesc = inputTensor[0]->get_desc(); + if (inputTensor.size() > 1 && inputTensor[0]->length() < inputTensor[1]->length()) { + outputDesc = inputTensor[1]->get_desc(); } - TensorDesc inputDesc = inputTensor[0]->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); - outputDesc.dt = DT_I32; - outputDesc.nDims = 1; - outputDesc.df = DF_NORMAL; - outputDesc.dims[0] = inputDesc.dims[inputDesc.nDims - 1]; + outputDesc.dt = DT_U8; if (IS_GPU(archInfo->arch)) { -#ifdef _USE_GPU - if (outputDesc.dims[0] > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } -#endif + outputDesc.dt = DT_I32; } outputTensor->resize(outputDesc); return SUCCESS; diff --git a/compute/tensor/src/concat.cpp b/compute/tensor/src/concat.cpp index f80cf54a..f7c0e32c 100644 --- a/compute/tensor/src/concat.cpp +++ b/compute/tensor/src/concat.cpp @@ -22,15 +22,16 @@ inline void processInputDescs(std::vector *inputDesc, I32 axis) { - int inputNum = inputDesc->size(); - int axisInfo = (axis > 0) ? 
axis : ((*inputDesc)[0].nDims + axis); - axisInfo = (*inputDesc)[0].nDims - 1 - axisInfo; - for (int i = 0; i < (int)(*inputDesc)[0].nDims; i++) { - if (i == axisInfo) { + int num = inputDesc->size(); + int dim = (*inputDesc)[0].nDims; + axis = (axis + dim) % dim; + axis = dim - 1 - axis; + for (int i = 0; i < dim; i++) { + if (i == axis) { continue; } U32 minDim = (*inputDesc)[0].dims[i]; - for (int j = 1; j < inputNum; j++) { + for (int j = 1; j < num; j++) { if ((*inputDesc)[j].dims[i] < minDim) { minDim = (*inputDesc)[j].dims[i]; } @@ -38,7 +39,7 @@ inline void processInputDescs(std::vector *inputDesc, I32 axis) if (minDim == 0) { continue; } - for (int j = 0; j < inputNum; j++) { + for (int j = 0; j < num; j++) { (*inputDesc)[j].dims[i] = minDim; } } @@ -48,7 +49,7 @@ inline EE concat_infer_output_size_cpu( std::vector inputDesc, ConcatParamSpec p, TensorDesc *outputDesc) { if (inputDesc.size() < 1) { - CHECK_STATUS(NOT_MATCH); + return NOT_MATCH; } if (inputDesc.size() == 1) { *outputDesc = inputDesc[0]; @@ -70,11 +71,13 @@ inline EE concat_infer_output_size_cpu( axis = dim - 1 - axis; outputDesc->dims[axis] = 0; + int shapeCount = 0; for (U32 i = 0; i < inputDesc.size(); i++) { if (inputDesc[i].nDims == 0) { continue; } + shapeCount += tensorIsShape(inputDesc[i]); if (inputDesc[i].nDims != (U32)dim) { return NOT_MATCH; } @@ -101,7 +104,18 @@ inline EE concat_infer_output_size_cpu( outputDesc->df = DF_NCHW; } - return SUCCESS; + EE ret = SUCCESS; +#ifdef _USE_CPU + if (shapeCount > 0) { + std::vector input(inputDesc.size()); + for (U32 i = 0; i < inputDesc.size(); i++) { + input[i] = inputDesc[i].dims + inputDesc[i].nDims; + } + ret = concat_cpu(inputDesc, input, nullptr, p, nullptr, *outputDesc, + outputDesc->dims + outputDesc->nDims, nullptr); + } +#endif + return ret; } EE concat_infer_output_size( @@ -130,9 +144,11 @@ EE concat_infer_output_size( return ret; } -EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, ArchInfo_t archInfo) +EE concat_infer_forward_tmp_bytes( + std::vector inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { std::vector inputDesc = get_desc_from_tensors(inputTensor); + TensorDesc outputDesc = outputTensor.get_desc(); EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -142,7 +158,9 @@ EE concat_infer_forward_tmp_bytes(std::vector inputTensor, U32 *bytes, A } else { *bytes = 0; for (auto p : inputDesc) { - *bytes += tensorNumBytes(p); + if (p.df != outputDesc.df) { + *bytes += tensorNumBytes(p); + } } ret = SUCCESS; } diff --git a/compute/tensor/src/convolution.cpp b/compute/tensor/src/convolution.cpp index d9e7e9ff..c9ffbd9e 100644 --- a/compute/tensor/src/convolution.cpp +++ b/compute/tensor/src/convolution.cpp @@ -57,9 +57,9 @@ inline EE convolution_infer_output_size_cpu(TensorDesc inputDesc, U32 ftDilated = (ft - 1) * p.dilatedRate_t + 1; U32 fhDilated = (fh - 1) * p.dilatedRate_h + 1; U32 fwDilated = (fw - 1) * p.dilatedRate_w + 1; - ot = (it + p.padding_before + p.padding_after - ftDilated) / p.stride_t + 1; - oh = (ih + p.padding_top + p.padding_bottom - fhDilated) / p.stride_h + 1; - ow = (iw + p.padding_left + p.padding_right - fwDilated) / p.stride_w + 1; + ot = (it + p.pad_before + p.pad_after - ftDilated) / p.stride_t + 1; + oh = (ih + p.pad_top + p.pad_bottom - fhDilated) / p.stride_h + 1; + ow = (iw + p.pad_left + p.pad_right - fwDilated) / p.stride_w + 1; if (ot < 0 || oh < 0 || ow < 0) { ret = NOT_MATCH; } @@ -377,9 +377,8 @@ EE convolution(std::vector inputTensors, } ret = 
convolution_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, filterDesc, (GCLMem_t)filter, convParamSpec, - ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, - biasDesc, (GCLMem_t)bias, tmpBytes, tmpVec, outputDesc, (GCLMem_t)output, - activationDesc.mode); + ((MaliPara_t)(archInfo->archPara))->forwardRunInfo, scaleDesc, (GCLMem_t)scale, biasDesc, + (GCLMem_t)bias, tmpBytes, tmpVec, outputDesc, (GCLMem_t)output, activationDesc.mode); #endif } @@ -388,7 +387,7 @@ EE convolution(std::vector inputTensors, if (inputTensors.size() > 1 && isEltwiseSeperate) { std::vector eltwiseInputTensors = {outputTensor, inputTensors[1]}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = eltwiseActDesc.mode; eltwiseDesc.activation_spec = convParamSpec.activation_spec; ret = eltwise(eltwiseInputTensors, eltwiseDesc, tmpTensors[0], outputTensor, archInfo); diff --git a/compute/tensor/src/copy.cpp b/compute/tensor/src/copy.cpp index 17b9da4a..3fcf8102 100644 --- a/compute/tensor/src/copy.cpp +++ b/compute/tensor/src/copy.cpp @@ -57,7 +57,7 @@ EE copy(std::vector inputTensor, UNI_ERROR_LOG("copy %u bytes from src tensor(%u) beyond size(%u).\n", copyLength, srcIndex, inputTensor[0].bytes()); } - memcpy((U8 *)input[1] + dstIndex, (U8 *)input[0] + srcIndex, copyLength); + UNI_MEMCPY((U8 *)input[1] + dstIndex, (U8 *)input[0] + srcIndex, copyLength); ret = SUCCESS; #endif } diff --git a/compute/tensor/src/cpu/argmax.cpp b/compute/tensor/src/cpu/argmax.cpp index a5bb6ba6..afb561d6 100644 --- a/compute/tensor/src/cpu/argmax.cpp +++ b/compute/tensor/src/cpu/argmax.cpp @@ -77,6 +77,10 @@ EE argmax_cpu( break; } #endif + case DT_I32: { + ret = argmax(inputDesc, (const I32 *)input, axis, outputDesc, (U32 *)output); + break; + } default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/arm_functions.h b/compute/tensor/src/cpu/arm/arm_functions.h index 20c2fdb8..3c1d60d5 100644 --- a/compute/tensor/src/cpu/arm/arm_functions.h +++ b/compute/tensor/src/cpu/arm/arm_functions.h @@ -107,6 +107,9 @@ inline EE array_minmax_value_arm(DataType dt, const void *data, I32 len, int mod case DT_I32: ret = array_minmax_value_i32((const I32 *)data, len, mode, result); break; + case DT_U32: + ret = array_minmax_value_template((const U32 *)data, len, mode, result); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp index 4e70ee93..f46dbd41 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A55.cpp @@ -44,10 +44,10 @@ EE convolution_dorefa_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp index 83cb462a..bef43159 100644 --- 
a/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_dorefa_A76.cpp @@ -44,10 +44,10 @@ EE convolution_dorefa_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h index 9d9329a6..8afeed50 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h +++ b/compute/tensor/src/cpu/arm/bnn/convolution_transform_bnn.h @@ -15,8 +15,8 @@ #define _H_CONVOLUTION_TRANSFORM_BNN #include -#include +#include "uni.h" #include "tensor_desc.h" inline void bitwise_copy(BIN8 srcVal, U32 srcBit, BIN8 *dest, U32 destBit) @@ -46,7 +46,7 @@ inline EE convolution_transform_filter_bnn( switch (fdf) { case DF_NCHWN16C8: // Everything is ready - memcpy(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw / 8 * bytesOf(fdt)); break; case DF_NCHW: { /* diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp index 92ef5221..1178c32a 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A55.cpp @@ -44,10 +44,10 @@ EE convolution_xnor_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp index 52ae3a88..48ae960b 100644 --- a/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp +++ b/compute/tensor/src/cpu/arm/bnn/convolution_xnor_A76.cpp @@ -44,10 +44,10 @@ EE convolution_xnor_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16C8) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/check.cpp b/compute/tensor/src/cpu/arm/check.cpp deleted file mode 100644 index e4e1ac81..00000000 --- a/compute/tensor/src/cpu/arm/check.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (C) 2019. 
Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/tensor_computing_arm.h" -#include "arm_neon_expand.h" -#ifdef _USE_FP32 -#include "cpu/arm/fp32/tensor_computing_fp32.h" -#endif -#ifdef _USE_FP16 -#include "cpu/arm/fp16/tensor_computing_fp16.h" -#endif - -static EE check_u32(TensorDesc inputDescA, - const U32 *inputA, - TensorDesc inputDescB, - const U32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - for (U32 j = 0; j < loopOuter; j++) { - const U32 *arrayA = inputA + j * length; - const U32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - uint32x4_t a = vld1q_u32(arrayA + i); - uint32x4_t b = vld1q_u32(arrayA + i); - count_v = vaddq_u32(count_v, vceqq_u32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} - -EE check_arm(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif -#ifdef _USE_FP16 - case DT_F16: { - ret = check_fp16(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - 
return ret; -} diff --git a/compute/tensor/src/cpu/arm/convolution.cpp b/compute/tensor/src/cpu/arm/convolution.cpp index 2391fb0e..505a1f7b 100644 --- a/compute/tensor/src/cpu/arm/convolution.cpp +++ b/compute/tensor/src/cpu/arm/convolution.cpp @@ -60,7 +60,7 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); it = ft = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -75,8 +75,8 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, if ((idf != DF_NCHWC8 || ic / p.group % 8 != 0) && DT_I8 != idt) { *algorithm = CONVOLUTION_ALGORITHM_GEMM_ICNCHW; } else if (ft == 1 && fh == 3 && fw == 3 && p.stride_t == 1 && p.stride_h == 1 && - p.stride_w == 1 && p.padding_before == 0 && p.padding_after == 0 && p.padding_top == 1 && - p.padding_bottom == 1 && p.padding_left == 1 && p.padding_right == 1) { + p.stride_w == 1 && p.pad_before == 0 && p.pad_after == 0 && p.pad_top == 1 && + p.pad_bottom == 1 && p.pad_left == 1 && p.pad_right == 1) { *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; } else { *algorithm = CONVOLUTION_ALGORITHM_GEMM; @@ -141,7 +141,7 @@ EE convolution_infer_forward_algorithm_arm(TensorDesc inputDesc, CHECK_STATUS(convolution_transform_filter_arm( filterDesc, filter, p, convolutionAlgorithms[i], &ftmDesc, filterTransformed)); - memset(tmp, 0, tmpBytes); + UNI_MEMSET(tmp, 0, tmpBytes); double timeStart = ut_time_ms(); CHECK_STATUS(convolution_arm(inputDesc, input, ftmDesc, filterTransformed, p, convolutionAlgorithms[i], scaleDesc, scale, biasDesc, bias, tmpBytes, tmp, @@ -306,7 +306,7 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -314,9 +314,9 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, } else { return NOT_SUPPORTED; } - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; U32 tile_size = 0; switch (fdt) { case DT_F32: @@ -360,10 +360,10 @@ EE convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, case CONVOLUTION_ALGORITHM_WINOGRAD: { U32 tile_h = (oh + 3) / 4; U32 tile_w = (ow + 3) / 4; - U32 pad_left = p.padding_left; - U32 pad_right = p.padding_right + (tile_w * 4 - ow); - U32 pad_top = p.padding_top; - U32 pad_bottom = p.padding_bottom + (tile_h * 4 - oh); + U32 pad_left = p.pad_left; + U32 pad_right = p.pad_right + (tile_w * 4 - ow); + U32 pad_top = p.pad_top; + U32 pad_bottom = p.pad_bottom + (tile_h * 4 - oh); ih_pad = ih + pad_top + pad_bottom; iw_pad = iw + pad_left + pad_right; *bytes = ic * ih_pad * iw_pad * element_size; diff --git a/compute/tensor/src/cpu/arm/deconvolution.cpp b/compute/tensor/src/cpu/arm/deconvolution.cpp index 09f7301e..db47a1cd 
100644 --- a/compute/tensor/src/cpu/arm/deconvolution.cpp +++ b/compute/tensor/src/cpu/arm/deconvolution.cpp @@ -66,8 +66,8 @@ EE deconvolution_overlap_crop_arm_kernel(T *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) @@ -96,7 +96,7 @@ EE deconvolution_overlap_crop_arm_kernel(T *input, } } output += oc * oh * ow; - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp index 3e0d5130..2affc5c1 100644 --- a/compute/tensor/src/cpu/arm/depthwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/depthwise_convolution.cpp @@ -41,7 +41,7 @@ EE depthwise_convolution_transform_filter_arm(TensorDesc filterDesc, ftmDesc->df = ftmDataFormat; EE ret = NOT_SUPPORTED; if (filterDesc.df == ftmDataFormat) { - memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(filterTransformed, filter, tensorNumBytes(filterDesc)); ret = SUCCESS; } else if (filterDesc.df == DF_NCHW) { if (ftmDataFormat == DF_NCHWC8) { @@ -69,10 +69,10 @@ EE depthwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; diff --git a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp index 84e70dbf..bd8723ac 100644 --- a/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/depthwise_pointwise_convolution.cpp @@ -56,10 +56,10 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_arm(TensorDesc inputD case DT_F16: { U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fh == 3 && fw == 3 && strideH == 1 && strideW == 1 && paddingT == 1 && paddingB == 1 && paddingL == 1 && paddingR == 1 && ow % 4 == 0 && ow >= 12) { @@ -112,10 +112,10 @@ EE depthwise_pointwise_convolution_infer_forward_tmp_bytes_arm(TensorDesc inputD U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = 
convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; diff --git a/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h index 9674a1c1..1bf628c4 100644 --- a/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h +++ b/compute/tensor/src/cpu/arm/fp16/arm_functions_fp16.h @@ -14,11 +14,8 @@ #ifndef _H_ARM_FUNCTIONS_FP16 #define _H_ARM_FUNCTIONS_FP16 -#include +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "uni.h" -#include "data_type.h" -#include "parameter_spec.h" // array sum inline F32 array_sum_f16(const F16 *data, I32 len) @@ -237,7 +234,7 @@ inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power) #endif } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F16)); + UNI_MEMCPY(output, input, len * sizeof(F16)); } i = len; } else if (power == 2) { @@ -263,137 +260,110 @@ inline void array_power_f16(F16 *input, F16 *output, I32 len, F32 power) inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDesc, F16 *output) { - float16x8_t in, out; float16x8_t zero = vdupq_n_f16(float16_t(0.)); float16x8_t one = vdupq_n_f16(float16_t(1.)); float16x8_t three = vdupq_n_f16(float16_t(3.)); float16x8_t six = vdupq_n_f16(float16_t(6.)); - U32 len_main = len / 8; - U32 len_tail = len % 8; - - F16 value; + U32 loops = len / 8 * 8; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(F16) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmaxq_f16(zero, in); + vst1q_f16(output + i, out); } } else { float16x8_t scale = vdupq_n_f16(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); float16x8_t tmp = vmulq_f16(scale, in); - out = vmaxq_f16(tmp, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; + float16x8_t out = vmaxq_f16(tmp, in); + vst1q_f16(output + i, out); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmaxq_f16(zero, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmaxq_f16(zero, in); out = vminq_f16(six, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 
0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vdivq_f16(out, six); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = value / 6; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vdivq_f16(out, six); out = vmulq_f16(out, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vaddq_f16(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vaddq_f16(in, three); out = vmaxq_f16(out, zero); out = vminq_f16(out, six); out = vmulq_f16(out, in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = input[i] * value; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_GELU: { - F16 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float16x8_t vec0 = vdupq_n_f16(two_div_PI_sqrt); + float16x8_t vec0 = vdupq_n_f16(sqrt(2 / 3.14159265358979323846)); float16x8_t vec1 = vdupq_n_f16(float16_t(0.044715)); float16x8_t vec2 = vdupq_n_f16(float16_t(0.5)); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmulq_f16(in, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16(in, in); out = vmulq_f16(out, in); out = vfmaq_f16(in, vec1, out); out = vmulq_f16(vec0, out); @@ -401,136 +371,122 @@ inline EE activation_fp16(F16 *input, U32 len, ActivationParamSpec activationDes out = vaddq_f16(one, out); out = vmulq_f16(vec2, out); out = vmulq_f16(in, out); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + vst1q_f16(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vtanhq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vtanhq_f16(in); + vst1q_f16(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vsigmoidq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vsigmoidq_f16(in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vmulq_f16( - in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16(in, vsigmoidq_f16(in)); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 
1 : 0; + case ACTIVATION_MISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vmulq_f16( + in, vtanhq_f16(vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)))); + vst1q_f16(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vlogq_f16(vaddq_f16(vexpq_f16_03_percent_error(in), one)); + vst1q_f16(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vexpq_f16_03_percent_error(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vexpq_f16_03_percent_error(in); + vst1q_f16(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f16(input); - out = vabsq_f16(in); - vst1q_f16(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); - } - break; - } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); - } - break; - } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vabsq_f16(in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 
0 : 1; + case ACTIVATION_RECIPROCAL: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 8) { + float16x8_t in = vld1q_f16(input + i); + float16x8_t out = vdivq_f16(one, in); + vst1q_f16(output + i, out); } break; } - case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + case ACTIVATION_SIGN: + case ACTIVATION_LOG: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: + case ACTIVATION_NEG: + case ACTIVATION_ROUND: + case ACTIVATION_CEIL: + case ACTIVATION_FLOOR: { + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } diff --git a/compute/tensor/src/cpu/arm/fp16/attention.cpp b/compute/tensor/src/cpu/arm/fp16/attention.cpp index 050203ab..1b22260b 100644 --- a/compute/tensor/src/cpu/arm/fp16/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" EE attention_fp16(U32 batch, @@ -26,14 +25,14 @@ EE attention_fp16(U32 batch, } F16 mask_s = -10000.0; - I32 count = array_sum_f16(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); float16x8_t mask_v = vdupq_n_f16(float16_t(mask_s)); float16x8_t one_v = vdupq_n_f16(float16_t(1.0)); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f16(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -47,12 +46,12 @@ EE attention_fp16(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F16)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -62,12 +61,12 @@ EE attention_fp16(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F16)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F16)); } } diff --git a/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp index afad68e5..564e5db0 100644 --- a/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp16/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp16/tensor_computing_fp16.h" EE attention_mask_fp16(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp16(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F16) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F16) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/arm/fp16/check.cpp b/compute/tensor/src/cpu/arm/fp16/check.cpp deleted file mode 100644 index 139677cd..00000000 --- a/compute/tensor/src/cpu/arm/fp16/check.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/fp16/tensor_computing_fp16.h" - -EE check_fp16(TensorDesc inputDescA, - const F16 *inputA, - TensorDesc inputDescB, - const F16 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F16 *arrayA = inputA + j * length; - const F16 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgtq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vcgeq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint16x8_t count_v = vdupq_n_u16(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - float16x8_t a = vld1q_f16(arrayA + i); - float16x8_t b = 
vld1q_f16(arrayA + i); - count_v = vaddq_u16(count_v, vceqq_f16(a, b)); - } - I32 count = vaddvq_u16(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/arm/fp16/clip.cpp b/compute/tensor/src/cpu/arm/fp16/clip.cpp index 3f19ae9e..d9b63e61 100644 --- a/compute/tensor/src/cpu/arm/fp16/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp16/clip.cpp @@ -21,14 +21,15 @@ EE clip_fp16(F16 *input, F16 *output, I32 len, F32 minValue, F32 maxValue) float16x8_t min_v = vdupq_n_f16(minValue); float16x8_t max_v = vdupq_n_f16(maxValue); - - I32 i = 0; - for (i = 0; i < len - 7; i += 8) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < len - 7; i += 8) { float16x8_t in = vld1q_f16(input + i); float16x8_t tmp_v = vminq_f16(max_v, vmaxq_f16(min_v, in)); vst1q_f16(output + i, tmp_v); } - for (; i < len; i++) { + for (int i = len / 8 * 8; i < len; i++) { F16 value = input[i]; value = (value > minValue) ? value : minValue; value = (value < maxValue) ? value : maxValue; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp index 3782db73..b356e8b6 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_direct.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "cpu/arm/fp16/convolution_direct.h" EE convolution_direct(TensorDesc inputDesc, @@ -43,10 +41,10 @@ EE convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_NCHWN16) { CHECK_STATUS(NOT_MATCH); @@ -67,20 +65,20 @@ EE convolution_direct(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < paddingT; h++) { - memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); inArray_pad_mov += paddingR * 8; } for (U32 h = ih_pad - paddingB; h < ih_pad; h++) { - memset(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += iw_pad * 8; } } @@ -400,10 +398,9 @@ 
EE convolution_direct(TensorDesc inputDesc, : [in_h0w0] "r"(in_h0w0), [in_h0w1] "r"(in_h0w1), [in_h0w2] "r"(in_h0w2), [in_h0w3] "r"(in_h0w3), [in_h1w0] "r"(in_h1w0), [in_h1w1] "r"(in_h1w1), - [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), - [f_c0] "r"(f_c0), [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), - [f_c3] "r"(f_c3), [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), - [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) + [in_h1w2] "r"(in_h1w2), [in_h1w3] "r"(in_h1w3), [f_c0] "r"(f_c0), + [f_c1] "r"(f_c1), [f_c2] "r"(f_c2), [f_c3] "r"(f_c3), + [f_c4] "r"(f_c4), [f_c5] "r"(f_c5), [f_c6] "r"(f_c6), [f_c7] "r"(f_c7) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp index 0ac90160..9e39bf6e 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A55.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_A55(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp index 5aa70b33..d89e4c0f 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_A76.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_A76(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h index e6573d8b..4ee41a70 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw.h @@ -14,7 +14,6 @@ #ifndef _H_CONVOLUTION_GEMM_ICNCHW #define _H_CONVOLUTION_GEMM_ICNCHW -#include #include "sys.h" 
#include "tensor_desc.h" #include "parameter_spec.h" diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp index d784f644..1fbd7293 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A55.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_icnchw_A55(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp index 30eca92b..20418bbe 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_gemm_icnchw_A76.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_icnchw_A76(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; int oc_1 = oc - 1; diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp index 172dd435..480e5e9b 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_transform.cpp @@ -25,7 +25,7 @@ static EE convolution_transform_filter_kernel_fp16(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp index 811bdc6e..6b562f58 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A55.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, 
&in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN16) { CHECK_STATUS(NOT_MATCH); @@ -77,8 +77,8 @@ EE convolution_winograd_A55(TensorDesc inputDesc, int oc_1 = oc - 1; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F16 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F16 *)tmp); @@ -454,15 +454,15 @@ EE convolution_winograd_A55(TensorDesc inputDesc, // itm[c8*4 + 3] = Iw3[i][c8]; // } - __asm__ __volatile__("ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - : [itm] "+r"(itm) - : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), - [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) - : "memory", "cc", "v0", "v1", "v2", "v3"); + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); } } for (I32 o = 0; o < oc_1; o += 2) { @@ -663,7 +663,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, // for (U32 c8 = 0; c8 < 8; c8++) { // itm[c8] = Iw0[i][c8]; // } - memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw0[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < oc_1; o += 2) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp index 852bcb41..ff799909 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_A76.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN16) { CHECK_STATUS(NOT_MATCH); @@ -78,8 +78,8 @@ EE convolution_winograd_A76(TensorDesc inputDesc, EE ret = SUCCESS; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F16 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F16 *)tmp); @@ -413,15 +413,15 @@ EE convolution_winograd_A76(TensorDesc inputDesc, // itm[c8*4 + 2] 
= Iw2[i][c8]; // itm[c8*4 + 3] = Iw3[i][c8]; // } - __asm__ __volatile__("ldr q0, [%[in_0]]\n" - "ldr q1, [%[in_1]]\n" - "ldr q2, [%[in_2]]\n" - "ldr q3, [%[in_3]]\n" - "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" - : [itm] "+r"(itm) - : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), - [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) - : "memory", "cc", "v0", "v1", "v2", "v3"); + __asm__ __volatile__( + "ldr q0, [%[in_0]]\n" + "ldr q1, [%[in_1]]\n" + "ldr q2, [%[in_2]]\n" + "ldr q3, [%[in_3]]\n" + "st4 {v0.8h, v1.8h, v2.8h, v3.8h}, [%[itm]]\n" + : [itm] "+r"(itm) + : [in_0] "r"(Iw0[i]), [in_1] "r"(Iw1[i]), [in_2] "r"(Iw2[i]), [in_3] "r"(Iw3[i]) + : "memory", "cc", "v0", "v1", "v2", "v3"); } } for (I32 o = 0; o < oc_1; o += 2) { @@ -603,7 +603,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, // for (U32 c8 = 0; c8 < 8; c8++) { // itm[c8] = Iw0[i][c8]; // } - memcpy(itm, Iw0[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw0[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < oc_1; o += 2) { diff --git a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h index 6580f66e..f9c3ee43 100644 --- a/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/fp16/convolution_winograd_transform.h @@ -14,7 +14,7 @@ #ifndef _H_WINOGRAD_TRANSFORM #define _H_WINOGRAD_TRANSFORM -#include +#include #include "cpu/arm/fp16/arm_functions_fp16.h" inline void trans_W_4x4_3x3(F16 *Fw[36], F16 *const F[9]) @@ -297,22 +297,22 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) vst1q_f16(Iw[i * 6 + 4], v_Iw4); vst1q_f16(Iw[i * 6 + 5], v_Iw5); } else { - F16 max = vmaxvq_f16(v_Iw0); - F16 min = vminvq_f16(v_Iw0); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + F32 max = vmaxvq_f16(v_Iw0); + F32 min = vminvq_f16(v_Iw0); + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw0); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][0][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -321,7 +321,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -331,27 +331,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 0], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 0], v_Iw0); } max = vmaxvq_f16(v_Iw1); min = vminvq_f16(v_Iw1); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw1); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] + T[i][2][c]) * -4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -360,7 +360,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 sum = T[i][3][c] + T[i][4][c]; 
tmp += sum; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (sum > 0) { tmp = 65504; } else { @@ -370,27 +370,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 1], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 1], v_Iw1); } max = vmaxvq_f16(v_Iw2); min = vminvq_f16(v_Iw2); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw2); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][2][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -399,7 +399,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][3][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -409,27 +409,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 2], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 2], v_Iw2); } max = vmaxvq_f16(v_Iw3); min = vminvq_f16(v_Iw3); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw3); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][3][c] - T[i][1][c]) * 2; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -438,7 +438,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -448,27 +448,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 3], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 3], v_Iw3); } max = vmaxvq_f16(v_Iw4); min = vminvq_f16(v_Iw4); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw4); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][3][c]) * 2; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -477,7 +477,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][4][c] - T[i][2][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -487,27 +487,27 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 4], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 4], check, 8 * 
bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 4], v_Iw4); } max = vmaxvq_f16(v_Iw5); min = vminvq_f16(v_Iw5); - if (UNI_ISNAN(max) || UNI_ISINF(max) || UNI_ISNAN(min) || UNI_ISINF(min)) { + if (isnan(max) || isinf(max) || isnan(min) || isinf(min)) { F16 check[8]; vst1q_f16(check, v_Iw5); for (U32 c = 0; c < 8; c++) { - F16 tmp = check[c]; - if (UNI_ISINF(tmp)) { + F32 tmp = check[c]; + if (isinf(tmp)) { if (tmp > 0) { check[c] = 65504; // FMAX for F16 } else { check[c] = -65504; } - } else if (UNI_ISNAN(tmp)) { + } else if (isnan(tmp)) { tmp = (T[i][1][c] - T[i][3][c]) * 4; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (tmp > 0) { tmp = 65504; // FMAX for F16 } else { @@ -516,7 +516,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) } F16 diff = T[i][5][c] - T[i][3][c]; tmp += diff; - if (UNI_ISINF(tmp)) { + if (isinf(tmp)) { if (diff > 0) { tmp = 65504; } else { @@ -526,7 +526,7 @@ inline void trans_I_4x4_3x3(F16 *Iw[36], F16 *const I[36]) check[c] = tmp; } } - memcpy(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); + UNI_MEMCPY(Iw[i * 6 + 5], check, 8 * bytesOf(DT_F16)); } else { vst1q_f16(Iw[i * 6 + 5], v_Iw5); } diff --git a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp index 85e52e08..19a57973 100644 --- a/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp16/deconvolution_transform.cpp @@ -25,7 +25,7 @@ inline EE deconvolution_transform_filter_kernel_fp16(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp index 2998aa5a..e6201bb7 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A55.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -72,20 +72,20 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 
8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -138,30 +138,29 @@ EE depthwise_pointwise_convolution_direct_A55(TensorDesc inputDesc, F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17"); + __asm__ __volatile__( + "ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); } } diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp index 46d0c628..8c49bf9c 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_A76.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -71,20 +71,20 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + 
UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -137,30 +137,29 @@ EE depthwise_pointwise_convolution_direct_A76(TensorDesc inputDesc, F16 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F16 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F16 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldr q17, [%[f0]]\n" - "ldr q9, [%[in0]]\n" - "ldr q10, [%[in1]]\n" - "ldr q11, [%[in2]]\n" - "ldr q12, [%[in3]]\n" - "ldr q13, [%[in4]]\n" - "ldr q14, [%[in5]]\n" - "ldr q15, [%[in6]]\n" - "ldr q16, [%[in7]]\n" - "fmla v0.8h, v9.8h, v17.8h\n" - "fmla v1.8h, v10.8h, v17.8h\n" - "fmla v2.8h, v11.8h, v17.8h\n" - "fmla v3.8h, v12.8h, v17.8h\n" - "fmla v4.8h, v13.8h, v17.8h\n" - "fmla v5.8h, v14.8h, v17.8h\n" - "fmla v6.8h, v15.8h, v17.8h\n" - "fmla v7.8h, v16.8h, v17.8h\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v9", "v10", "v11", "v12", "v13", "v14", - "v15", "v16", "v17"); + __asm__ __volatile__( + "ldr q17, [%[f0]]\n" + "ldr q9, [%[in0]]\n" + "ldr q10, [%[in1]]\n" + "ldr q11, [%[in2]]\n" + "ldr q12, [%[in3]]\n" + "ldr q13, [%[in4]]\n" + "ldr q14, [%[in5]]\n" + "ldr q15, [%[in6]]\n" + "ldr q16, [%[in7]]\n" + "fmla v0.8h, v9.8h, v17.8h\n" + "fmla v1.8h, v10.8h, v17.8h\n" + "fmla v2.8h, v11.8h, v17.8h\n" + "fmla v3.8h, v12.8h, v17.8h\n" + "fmla v4.8h, v13.8h, v17.8h\n" + "fmla v5.8h, v14.8h, v17.8h\n" + "fmla v6.8h, v15.8h, v17.8h\n" + "fmla v7.8h, v16.8h, v17.8h\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17"); } } diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h index dca6b30d..864083b7 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding.h @@ -14,7 +14,6 @@ #ifndef _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING #define _H_DEPTHWISE_POINTWISE_CONVOLUTION_DIRECT_NO_PADDING -#include #include "sys.h" #include "tensor_desc.h" #include "parameter_spec.h" diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp index e86fe2c9..d5dd04bd 100644 
--- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A55.cpp @@ -44,8 +44,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp index 24bcfb4a..8ea43e8c 100644 --- a/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp +++ b/compute/tensor/src/cpu/arm/fp16/depthwise_pointwise_convolution_direct_no_padding_A76.cpp @@ -44,8 +44,8 @@ EE depthwise_pointwise_convolution_direct_no_padding_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; if (dwFilterDesc.df != DF_NCHWC8 || pwFilterDesc.df != DF_NHWCN16) { CHECK_STATUS(NOT_MATCH); diff --git a/compute/tensor/src/cpu/arm/fp16/gru.cpp b/compute/tensor/src/cpu/arm/fp16/gru.cpp index 28a46a65..4afe84d9 100644 --- a/compute/tensor/src/cpu/arm/fp16/gru.cpp +++ b/compute/tensor/src/cpu/arm/fp16/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "cpu/arm/fp16/mvm_nkn32.h" @@ -54,9 +53,9 @@ EE grucell_fp16(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; + I32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 
2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { CHECK_STATUS(NOT_MATCH); @@ -64,8 +63,7 @@ EE grucell_fp16(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -84,16 +82,16 @@ EE grucell_fp16(TensorDesc xDesc, F16 *currentBatchH = currentHArray + m * currentHStride; F16 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F16)); } const F16 *mBias = (const F16 *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(F16)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(F16)); mvm_nkn32(column * 2 / 32, fk, (const F16 *)filter[0], xhArray, intermediateH); F16 *out_z = intermediateH; F16 *out_r = out_z + column; @@ -111,12 +109,12 @@ EE grucell_fp16(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { F16 *h_x_b = (F16 *)mBias + column * 2; F16 *h_h_b = (F16 *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(F16)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(F16)); mvm_nkn32(column / 32, hDim, (const F16 *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_f16(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(F16)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(F16)); mvm_nkn32( column / 32, xDim, (const F16 *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -124,7 +122,7 @@ EE grucell_fp16(TensorDesc xDesc, array_add_f16(h_x_b, out_h, out_h, hDim); } else { array_mul_f16(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, mBias + column * 2, column * sizeof(F16)); + UNI_MEMCPY(out_h, mBias + column * 2, column * sizeof(F16)); mvm_nkn32(column / 32, fk, (const F16 *)filter[0] + column * 2 * fk, xhArray, out_h); } for (h = 0; h < column - 7; h += 8) { @@ -147,7 +145,7 @@ EE grucell_fp16(TensorDesc xDesc, array_scale_f16(out_z, out_z, column, -1, 1); array_mul_f16(out_z, out_h, out_h, column); array_add_f16(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F16) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp16/lstm.cpp b/compute/tensor/src/cpu/arm/fp16/lstm.cpp index 52e1c9f1..f160d722 100644 --- a/compute/tensor/src/cpu/arm/fp16/lstm.cpp +++ b/compute/tensor/src/cpu/arm/fp16/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "cpu/arm/fp16/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp16(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F16 && fdt == DT_F16 && odt == DT_F16)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp16(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,15 +86,15 @@ EE lstmcell_fp16(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F16 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F16)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F16)); } else { intermediateH = tmpArray; xhArray = lastBatchH; } const F16 *mBias = (const F16 *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(F16)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(F16)); mvm_nkn32(fn, fk, (const F16 *)filter[0], xhArray, intermediateH); F16 *out_i = intermediateH; @@ -110,12 +108,12 @@ EE lstmcell_fp16(TensorDesc xDesc, F16 *currentOutput = outputArray + m * batchStrideH; F16 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -150,26 +148,26 @@ EE lstmcell_fp16(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f16(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f16(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f16(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(F16) * hDim); - mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F16 *)filter[1], tmpHH, tmpH); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(F16) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.num_projection, (const F16 *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f16(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f16(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f16(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f16(lastBatchH, lastBatchH, hDim, 
rnnParamSpec.zoneout_output, 0); array_add_f16(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F16) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F16) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/fp16/normalization.cpp b/compute/tensor/src/cpu/arm/fp16/normalization.cpp index 503e2970..97285ecf 100644 --- a/compute/tensor/src/cpu/arm/fp16/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp16/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/arm/fp16/tensor_computing_fp16.h" -inline void array_norm_scale_fp16( +static float eps = 1e-6; + +inline static void array_norm_scale_fp16( F16 *input, F16 *output, I32 len, F32 mean, F32 var, F16 *alpha, F16 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float16x8_t mean_v = vdupq_n_f16(mean); float16x8_t std_v = vdupq_n_f16(std_value); @@ -38,14 +39,10 @@ inline void array_norm_scale_fp16( } } -EE layer_normalization_fp16( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; @@ -57,6 +54,77 @@ EE layer_normalization_fp16( array_norm_scale_fp16(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + float16x8_t sum_v = vdupq_n_f16(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = vaddq_f16(sum_v, vld1q_f16(input + id)); + } + F32 mean = vaddvq_f16(sum_v) / c; + float16x8_t mean_v = vdupq_n_f16(mean); + sum_v = vdupq_n_f16(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + float16x8_t tmp_v = vsubq_f16(vld1q_f16(input + id), mean_v); + sum_v = vfmaq_f16(sum_v, tmp_v, tmp_v); + } + F32 var = vaddvq_f16(sum_v) / c; + F32 std_value = sqrt(var + eps); + + float16x8_t std_v = vdupq_n_f16(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + float16x8_t in = vld1q_f16(input + id); + float16x8_t alpha_v = vld1q_f16(alpha + kk); + float16x8_t beta_v = vld1q_f16(beta + kk); + + float16x8_t tmp_v = vsubq_f16(in, mean_v); + tmp_v = vdivq_f16(tmp_v, std_v); + tmp_v = vfmaq_f16(beta_v, alpha_v, tmp_v); + vst1q_f16(output + id, tmp_v); + } + } + } return SUCCESS; } + +EE layer_normalization_fp16(TensorDesc inputDesc, + F16 *input, + LayerNormParamSpec p, + F16 *alpha, + F16 *beta, + TensorDesc outputDesc, + F16 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return 
ret; +} diff --git a/compute/tensor/src/cpu/arm/fp16/scale.cpp b/compute/tensor/src/cpu/arm/fp16/scale.cpp index 0f736c7c..35148077 100644 --- a/compute/tensor/src/cpu/arm/fp16/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp16/scale.cpp @@ -35,28 +35,39 @@ EE scale_nchwc8_fp16( return SUCCESS; } +template EE scale_nchw_fp16( F16 *input, F16 *alpha, F16 *beta, I32 in, I32 ic, I32 elements_per_channel, F16 *output) { float16x8_t one = vdupq_n_f16(1.); float16x8_t zero = vdupq_n_f16(0.); - U32 index = 0; + U32 dst = 0, src = 0; for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float16x8_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f16(alpha[c]); float16x8_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f16(beta[c]); I32 i = 0; for (; i < elements_per_channel - 7; i += 8) { - float16x8_t in_vec = vld1q_f16(input + index); + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + float16x8_t in_vec = vld1q_f16(input + src); float16x8_t out_vec = vfmaq_f16(beta_vec, alpha_vec, in_vec); - vst1q_f16(output + index, out_vec); - index += 8; + vst1q_f16(output + dst, out_vec); + dst += 8; } for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } } @@ -119,7 +130,11 @@ EE scale_fp16(F16 *input, EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + if (ic == oc) { + ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp16(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp16(input, alpha, beta, on, oc, elements_per_channel, output); diff --git a/compute/tensor/src/cpu/arm/fp16/softmax.cpp b/compute/tensor/src/cpu/arm/fp16/softmax.cpp index 2e5b4178..31ecac0b 100644 --- a/compute/tensor/src/cpu/arm/fp16/softmax.cpp +++ b/compute/tensor/src/cpu/arm/fp16/softmax.cpp @@ -14,59 +14,77 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) +template +static void softmax_lastAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, F16 *output) { for (I32 i = 0; i < loopOuter; i++) { const F16 *inputPtr = input + i * loops; F16 *outputPtr = output + i * loops; - float16x8_t max_v, sub_v, sum_v, tmp_v; + float16x8_t max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f16(inputPtr, loops, 2, &max_s); - max_v = vdupq_n_f16(max_s); - sum_v = vdupq_n_f16(0); - + if (!logsoftmax) { + array_minmax_value_f16(inputPtr, loops, 2, &max_s); + max_v = vdupq_n_f16(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 7; j += 8) { + float16x8_t sum_v = vdupq_n_f16(0); + for (; j < loops - 7; j += 8) { float16x8_t in = vld1q_f16(inputPtr + j); - sub_v = vsubq_f16(in, max_v); - tmp_v = vexpq_f16_f32(sub_v); + if (!logsoftmax) { + in = vsubq_f16(in, max_v); + } + tmp_v = vexpq_f16_f32(in); sum_v = vaddq_f16(sum_v, tmp_v); - vst1q_f16(outputPtr + j, tmp_v); + if (!logsoftmax) { 
+ vst1q_f16(outputPtr + j, tmp_v); + } } - sum_s += vaddvq_f16(sum_v); + F32 sum_s = vaddvq_f16(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f16(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f16(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } -void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) +template +static void softmax_anyAxis_fp16( + const F16 *input, I32 loopOuter, I32 loops, I32 loopInner, F16 *output) { std::vector buffer(loopInner * 2); F16 *maxBuffer = &buffer[0]; F16 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F16 *inputPtrBase = input + i * loops * loopInner; F16 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); - memset(sumBuffer, 0, loopInner * sizeof(F16)); - for (I32 j = 1; j < loops; j++) { - const F16 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 7; k += 8) { - float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t out_v = vld1q_f16(maxBuffer + k); - float16x8_t max_v = vmaxq_f16(in_v, out_v); - vst1q_f16(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) { - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F16)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F16)); + for (I32 j = 1; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t in_v = vld1q_f16(inputPtr + k); + float16x8_t out_v = vld1q_f16(maxBuffer + k); + float16x8_t max_v = vmaxq_f16(in_v, out_v); + vst1q_f16(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +92,69 @@ void softmax_anyAxis_fp16(const F16 *input, I32 loopOuter, I32 loops, I32 loopIn F16 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 7; k += 8) { float16x8_t in_v = vld1q_f16(inputPtr + k); - float16x8_t max_v = vld1q_f16(maxBuffer + k); - float16x8_t sub_v = vsubq_f16(in_v, max_v); - float16x8_t exp_v = vexpq_f16_f32(sub_v); + if (!logsoftmax) { + in_v = vsubq_f16(in_v, vld1q_f16(maxBuffer + k)); + } + float16x8_t exp_v = vexpq_f16_f32(in_v); float16x8_t sum_v = vld1q_f16(sumBuffer + k); sum_v = vaddq_f16(sum_v, exp_v); vst1q_f16(sumBuffer + k, sum_v); - vst1q_f16(outputPtr + k, exp_v); + if (!logsoftmax) { + vst1q_f16(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F16 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 7; k += 8) { - float16x8_t out_v = vld1q_f16(outputPtr + k); float16x8_t sum_v = vld1q_f16(sumBuffer + k); - out_v = vdivq_f16(out_v, sum_v); - vst1q_f16(outputPtr + k, out_v); + sum_v = vlogq_f16(sum_v); + vst1q_f16(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + 
sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F16 *inputPtr = inputPtrBase + j * loopInner; + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t out_v = vld1q_f16(inputPtr + k); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + out_v = vsubq_f16(out_v, sum_v); + vst1q_f16(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F16 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + float16x8_t out_v = vld1q_f16(outputPtr + k); + float16x8_t sum_v = vld1q_f16(sumBuffer + k); + out_v = vdivq_f16(out_v, sum_v); + vst1q_f16(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +template +static EE softmax_kernel( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -145,9 +197,20 @@ EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp16(input, loop_outer, loops, output); + softmax_lastAxis_fp16(input, loop_outer, loops, output); } else { - softmax_anyAxis_fp16(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp16(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp16(TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} diff --git a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h index a9e008de..ca7b1b79 100644 --- a/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h +++ b/compute/tensor/src/cpu/arm/fp16/tensor_computing_fp16.h @@ -67,6 +67,12 @@ EE pooling_c8_fp16(const I32 &tstart, EE softmax_fp16( TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + +EE logsoftmax_fp16( + TensorDesc inputDesc, const F16 *input, int axis, TensorDesc outputDesc, F16 *output); + EE attention_fp16(U32 batch, U32 numHeads, I32 fromSequenceLength, @@ -167,8 +173,13 @@ EE power_fp16(TensorDesc inputDesc, TensorDesc outputDesc, F16 *output); -EE layer_normalization_fp16( - TensorDesc inputDesc, F16 *input, F16 *alpha, F16 *beta, TensorDesc outputDesc, F16 *output); +EE layer_normalization_fp16(TensorDesc inputDesc, + F16 *input, + LayerNormParamSpec p, + F16 *alpha, + F16 *beta, + TensorDesc outputDesc, + F16 *output); EE scale_fp16(F16 *input, I32 axis, diff --git a/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h index 9d2e9cf8..a6797bb5 100644 --- a/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h +++ b/compute/tensor/src/cpu/arm/fp32/arm_functions_fp32.h @@ -14,11 +14,8 @@ #ifndef _H_ARM_FUNCTIONS_FP32 #define _H_ARM_FUNCTIONS_FP32 -#include +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "uni.h" 
-#include "data_type.h" -#include "parameter_spec.h" // array sum inline F32 array_sum_f32(const F32 *data, I32 len) @@ -232,7 +229,7 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) #endif } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F32)); + UNI_MEMCPY(output, input, len * sizeof(F32)); } i = len; } else if (power == 2) { @@ -249,137 +246,109 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) { - float32x4_t in, out; float32x4_t zero = vdupq_n_f32(0.); float32x4_t one = vdupq_n_f32(1.); float32x4_t three = vdupq_n_f32(3.); float32x4_t six = vdupq_n_f32(6.); - U32 len_main = len / 4; - U32 len_tail = len % 4; - - F32 value; + U32 loops = len / 4 * 4; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(float) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] == 0) { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmaxq_f32(zero, in); + vst1q_f32(output + i, out); } } else { float32x4_t scale = vdupq_n_f32(activationDesc.value[0]); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); float32x4_t tmp = vmulq_f32(in, scale); - out = vmaxq_f32(tmp, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[i]; - output[i] = (input[i] < tmp) ? tmp : input[i]; + float32x4_t out = vmaxq_f32(tmp, in); + vst1q_f32(output + i, out); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmaxq_f32(zero, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmaxq_f32(zero, in); out = vminq_f32(six, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vdivq_f32(out, six); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = value / 6; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vdivq_f32(out, six); out = vmulq_f32(out, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vaddq_f32(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vaddq_f32(in, three); out = vmaxq_f32(out, zero); out = vminq_f32(out, six); out = vmulq_f32(out, in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_GELU: { - F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - float32x4_t vec0 = vdupq_n_f32(two_div_PI_sqrt); + float32x4_t vec0 = vdupq_n_f32(sqrt(2 / 3.14159265358979323846)); float32x4_t vec1 = vdupq_n_f32(0.044715); float32x4_t vec2 = vdupq_n_f32(0.5); - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmulq_f32(in, in); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32(in, in); out = vmulq_f32(out, in); out = vfmaq_f32(in, vec1, out); out = vmulq_f32(vec0, out); @@ -387,136 +356,122 @@ inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDes out = vaddq_f32(one, out); out = vmulq_f32(vec2, out); out = vmulq_f32(in, out); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + vst1q_f32(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vtanhq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vtanhq_f32(in); + vst1q_f32(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vsigmoidq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; +#ifdef _USE_OPENMP 
+#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vsigmoidq_f32(in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vmulq_f32( - in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32(in, vsigmoidq_f32(in)); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 1 : 0; + case ACTIVATION_MISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vmulq_f32( + in, vtanhq_f32(vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)))); + vst1q_f32(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vlogq_f32(vaddq_f32(vexpq_f32_03_percent_error(in), one)); + vst1q_f32(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vexpq_f32_03_percent_error(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vexpq_f32_03_percent_error(in); + vst1q_f32(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = vld1q_f32(input); - out = vabsq_f32(in); - vst1q_f32(output, out); - input += 4; - output += 4; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); - } - break; - } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); - } - break; - } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vabsq_f32(in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 
0 : 1; + case ACTIVATION_RECIPROCAL: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loops; i += 4) { + float32x4_t in = vld1q_f32(input + i); + float32x4_t out = vdivq_f32(one, in); + vst1q_f32(output + i, out); } break; } - case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + case ACTIVATION_SIGN: + case ACTIVATION_LOG: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: + case ACTIVATION_NEG: + case ACTIVATION_ROUND: + case ACTIVATION_CEIL: + case ACTIVATION_FLOOR: { + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } diff --git a/compute/tensor/src/cpu/arm/fp32/attention.cpp b/compute/tensor/src/cpu/arm/fp32/attention.cpp index 6861cae6..ef01a118 100644 --- a/compute/tensor/src/cpu/arm/fp32/attention.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp32/tensor_computing_fp32.h" EE attention_fp32(U32 batch, @@ -26,14 +25,14 @@ EE attention_fp32(U32 batch, } F32 mask_s = -10000.0; - I32 count = array_sum_f32(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); float32x4_t mask_v = vdupq_n_f32(mask_s); float32x4_t one_v = vdupq_n_f32(1.0); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f32(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 3; k += 4) { @@ -47,12 +46,12 @@ EE attention_fp32(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 3; k += 4) { @@ -62,12 +61,12 @@ EE attention_fp32(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F32)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F32)); } } diff --git a/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp index 3a34c6dc..3b3de80b 100644 --- a/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp +++ b/compute/tensor/src/cpu/arm/fp32/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp32/tensor_computing_fp32.h" EE attention_mask_fp32(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp32(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F32) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/arm/fp32/check.cpp b/compute/tensor/src/cpu/arm/fp32/check.cpp deleted file mode 100644 index 1e6894c7..00000000 --- a/compute/tensor/src/cpu/arm/fp32/check.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/arm/fp32/tensor_computing_fp32.h" - -EE check_fp32(TensorDesc inputDescA, - const F32 *inputA, - TensorDesc inputDescB, - const F32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F32 *arrayA = inputA + j * length; - const F32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgtq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vcgeq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - uint32x4_t count_v = vdupq_n_u32(0); - I32 i = 0; - for (; i < length - 3; i += 4) { - float32x4_t a = vld1q_f32(arrayA + i); - float32x4_t b = 
vld1q_f32(arrayA + i); - count_v = vaddq_u32(count_v, vceqq_f32(a, b)); - } - I32 count = vaddvq_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/arm/fp32/clip.cpp b/compute/tensor/src/cpu/arm/fp32/clip.cpp index a0b591be..220056f0 100644 --- a/compute/tensor/src/cpu/arm/fp32/clip.cpp +++ b/compute/tensor/src/cpu/arm/fp32/clip.cpp @@ -21,14 +21,15 @@ EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue) float32x4_t min_v = vdupq_n_f32(minValue); float32x4_t max_v = vdupq_n_f32(maxValue); - - I32 i = 0; - for (i = 0; i < len - 3; i += 4) { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int i = 0; i < len - 3; i += 4) { float32x4_t in = vld1q_f32(input + i); float32x4_t tmp_v = vminq_f32(max_v, vmaxq_f32(min_v, in)); vst1q_f32(output + i, tmp_v); } - for (; i < len; i++) { + for (int i = len / 4 * 4; i < len; i++) { F32 value = input[i]; value = (value > minValue) ? value : minValue; value = (value < maxValue) ? value : maxValue; diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp index 376a8735..cdcbf16c 100644 --- a/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp32/convolution_transform.cpp @@ -26,7 +26,7 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h index 098e9c67..4f2de717 100644 --- a/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h +++ b/compute/tensor/src/cpu/arm/fp32/convolution_winograd_transform.h @@ -16,7 +16,7 @@ #ifdef _USE_FP32 #include -#include + #include "cpu/arm/fp32/arm_functions_fp32.h" inline void trans_W_4x4_3x3(float *WTM[36], float *W[9]) diff --git a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp index 79bbb56d..fb2bcd8e 100644 --- a/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/fp32/deconvolution_transform.cpp @@ -25,7 +25,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, } if (filterDesc.df == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, tensorNumBytes(filterDesc)); + UNI_MEMCPY(ftmArray, filterArray, tensorNumBytes(filterDesc)); return SUCCESS; } if (filterDesc.df != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/fp32/gru.cpp b/compute/tensor/src/cpu/arm/fp32/gru.cpp index eeb16490..584f0793 100644 --- a/compute/tensor/src/cpu/arm/fp32/gru.cpp +++ b/compute/tensor/src/cpu/arm/fp32/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "cpu/arm/fp32/mvm_nkn32.h" @@ -54,9 +53,9 @@ EE grucell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; + I32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -64,8 +63,7 @@ EE grucell_fp32(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -84,16 +82,16 @@ EE grucell_fp32(TensorDesc xDesc, F32 *currentBatchH = currentHArray + m * currentHStride; F32 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F32)); } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(F32)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(F32)); mvm_nkn32(column * 2 / 32, fk, (const F32 *)filter[0], xhArray, intermediateH); F32 *out_z = intermediateH; F32 *out_r = out_z + column; @@ -111,12 +109,12 @@ EE grucell_fp32(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { F32 *h_x_b = (F32 *)mBias + column * 2; F32 *h_h_b = (F32 *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(F32)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(F32)); mvm_nkn32(column / 32, hDim, (const F32 *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_f32(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(F32)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(F32)); mvm_nkn32( column / 32, xDim, (const F32 *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -124,7 +122,7 @@ EE grucell_fp32(TensorDesc xDesc, array_add_f32(h_x_b, out_h, out_h, hDim); } else { array_mul_f32(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, mBias + column * 2, column * sizeof(F32)); + UNI_MEMCPY(out_h, mBias + column * 2, column * sizeof(F32)); mvm_nkn32(column / 32, fk, (const F32 *)filter[0] + column * 2 * fk, xhArray, out_h); } for (h = 0; h < column - 3; h += 4) { @@ -147,7 +145,7 @@ EE grucell_fp32(TensorDesc xDesc, array_scale_f32(out_z, out_z, column, -1, 1); array_mul_f32(out_z, out_h, out_h, column); array_add_f32(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/arm/fp32/lstm.cpp b/compute/tensor/src/cpu/arm/fp32/lstm.cpp index 1233d355..35f82da5 100644 --- a/compute/tensor/src/cpu/arm/fp32/lstm.cpp +++ b/compute/tensor/src/cpu/arm/fp32/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT 
OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "cpu/arm/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp32(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,15 +86,15 @@ EE lstmcell_fp32(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F32 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(F32)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(F32)); mvm_nkn32(fn, fk, (const F32 *)filter[0], xhArray, intermediateH); F32 *out_i = intermediateH; F32 *out_g = out_i + column; @@ -109,12 +107,12 @@ EE lstmcell_fp32(TensorDesc xDesc, F32 *currentOutput = outputArray + m * batchStrideH; F32 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -149,26 +147,26 @@ EE lstmcell_fp32(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f32(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(F32) * hDim); - mvm_nkn32(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], tmpHH, tmpH); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(F32) * hDim); + mvm_nkn32(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if 
(rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_f32(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/arm/fp32/normalization.cpp b/compute/tensor/src/cpu/arm/fp32/normalization.cpp index 6604b485..3c4ac5bc 100644 --- a/compute/tensor/src/cpu/arm/fp32/normalization.cpp +++ b/compute/tensor/src/cpu/arm/fp32/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/arm/fp32/tensor_computing_fp32.h" -inline void array_norm_scale_fp32( +static float eps = 1e-6; + +inline static void array_norm_scale_fp32( F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); float32x4_t mean_v = vdupq_n_f32(mean); float32x4_t std_v = vdupq_n_f32(std_value); @@ -38,14 +39,10 @@ inline void array_norm_scale_fp32( } } -EE layer_normalization_fp32( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; @@ -57,6 +54,87 @@ EE layer_normalization_fp32( array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + float32x4_t sum_v = vdupq_n_f32(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = vaddq_f32(sum_v, vld1q_f32(input + id)); + sum_v = vaddq_f32(sum_v, vld1q_f32(input + id + 4)); + } + F32 mean = vaddvq_f32(sum_v) / c; + float32x4_t mean_v = vdupq_n_f32(mean); + sum_v = vdupq_n_f32(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + float32x4_t tmp_v = vsubq_f32(vld1q_f32(input + id), mean_v); + sum_v = vfmaq_f32(sum_v, tmp_v, tmp_v); + tmp_v = vsubq_f32(vld1q_f32(input + id + 4), mean_v); + sum_v = vfmaq_f32(sum_v, tmp_v, tmp_v); + } + F32 var = vaddvq_f32(sum_v) / c; + F32 std_value = sqrt(var + eps); + + float32x4_t std_v = vdupq_n_f32(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + float32x4_t in = vld1q_f32(input + id); + float32x4_t alpha_v = vld1q_f32(alpha + kk); + float32x4_t beta_v = vld1q_f32(beta + kk); + float32x4_t tmp_v = vsubq_f32(in, mean_v); + tmp_v = vdivq_f32(tmp_v, std_v); + tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); + vst1q_f32(output + id, tmp_v); + + in = vld1q_f32(input + id + 4); + alpha_v = vld1q_f32(alpha + kk + 4); 
+ beta_v = vld1q_f32(beta + kk + 4); + tmp_v = vsubq_f32(in, mean_v); + tmp_v = vdivq_f32(tmp_v, std_v); + tmp_v = vfmaq_f32(beta_v, alpha_v, tmp_v); + vst1q_f32(output + id + 4, tmp_v); + } + } + } return SUCCESS; } + +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/arm/fp32/pooling.cpp b/compute/tensor/src/cpu/arm/fp32/pooling.cpp index db302d93..dc165ae8 100644 --- a/compute/tensor/src/cpu/arm/fp32/pooling.cpp +++ b/compute/tensor/src/cpu/arm/fp32/pooling.cpp @@ -94,6 +94,7 @@ EE pooling_bp_c8_fp32(const F32 *input, int hend, int wstart, int wend, + int pool, F32 *output, U32 stride, PoolingParamSpec poolingParamSpec) @@ -103,7 +104,7 @@ EE pooling_bp_c8_fp32(const F32 *input, if (pm != POOLING_MEAN) { ret = NOT_SUPPORTED; } - float32x4_t poolSize = vdupq_n_f32((hend - hstart) * (wend - wstart)); + float32x4_t poolSize = vdupq_n_f32(pool); float32x4_t in0 = vdivq_f32(vld1q_f32(input), poolSize); float32x4_t in1 = vdivq_f32(vld1q_f32(input + 4), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { diff --git a/compute/tensor/src/cpu/arm/fp32/scale.cpp b/compute/tensor/src/cpu/arm/fp32/scale.cpp index 394b3be7..172cd8a1 100644 --- a/compute/tensor/src/cpu/arm/fp32/scale.cpp +++ b/compute/tensor/src/cpu/arm/fp32/scale.cpp @@ -42,28 +42,39 @@ EE scale_nchwc8_fp32( return SUCCESS; } +template EE scale_nchw_fp32( F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { float32x4_t one = vdupq_n_f32(1.); float32x4_t zero = vdupq_n_f32(0.); - U32 index = 0; + U32 dst = 0, src = 0; for (I32 n = 0; n < in; n++) { for (I32 c = 0; c < ic; c++) { float32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_f32(alpha[c]); float32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_f32(beta[c]); I32 i = 0; for (; i < elements_per_channel - 3; i += 4) { - float32x4_t in_vec = vld1q_f32(input + index); + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + float32x4_t in_vec = vld1q_f32(input + src); float32x4_t out_vec = vfmaq_f32(beta_vec, alpha_vec, in_vec); - vst1q_f32(output + index, out_vec); - index += 4; + vst1q_f32(output + dst, out_vec); + dst += 4; } for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } } @@ -126,7 +137,11 @@ EE scale_fp32(F32 *input, EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. 
if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp32(input, alpha, beta, on, oc, elements_per_channel, output); + if (ic == oc) { + ret = scale_nchw_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output);
diff --git a/compute/tensor/src/cpu/arm/fp32/softmax.cpp b/compute/tensor/src/cpu/arm/fp32/softmax.cpp index f352e428..f874d264 100644 --- a/compute/tensor/src/cpu/arm/fp32/softmax.cpp +++ b/compute/tensor/src/cpu/arm/fp32/softmax.cpp @@ -14,59 +14,76 @@ #include "cpu/arm/fp32/tensor_computing_fp32.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +template <bool logsoftmax> +static void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) { for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtr = input + i * loops; F32 *outputPtr = output + i * loops; - float32x4_t max_v, sub_v, sum_v, tmp_v; + float32x4_t max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f32(inputPtr, loops, 2, &max_s); - max_v = vdupq_n_f32(max_s); - sum_v = vdupq_n_f32(0); - + if (!logsoftmax) { + array_minmax_value_f32(inputPtr, loops, 2, &max_s); + max_v = vdupq_n_f32(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 3; j += 4) { + float32x4_t sum_v = vdupq_n_f32(0); + for (; j < loops - 3; j += 4) { float32x4_t in = vld1q_f32(inputPtr + j); - sub_v = vsubq_f32(in, max_v); - tmp_v = vexpq_f32_03_percent_error(sub_v); + if (!logsoftmax) { + in = vsubq_f32(in, max_v); + } + tmp_v = vexpq_f32_03_percent_error(in); sum_v = vaddq_f32(sum_v, tmp_v); - vst1q_f32(outputPtr + j, tmp_v); + if (!logsoftmax) { + vst1q_f32(outputPtr + j, tmp_v); + } } - sum_s += vaddvq_f32(sum_v); + F32 sum_s = vaddvq_f32(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f32(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } +template <bool logsoftmax> void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) { std::vector<F32> buffer(loopInner * 2); F32 *maxBuffer = &buffer[0]; F32 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtrBase = input + i * loops * loopInner; F32 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); - memset(sumBuffer, 0, loopInner * sizeof(F32)); - for (I32 j = 1; j < loops; j++) { - const F32 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 3; k += 4) { - float32x4_t in_v = vld1q_f32(inputPtr + k); - float32x4_t out_v = vld1q_f32(maxBuffer + k); - float32x4_t max_v = vmaxq_f32(in_v, out_v); - vst1q_f32(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) { - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F32)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t in_v = vld1q_f32(inputPtr + k); + float32x4_t out_v = vld1q_f32(maxBuffer + k); + float32x4_t max_v = vmaxq_f32(in_v, out_v); + vst1q_f32(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +91,69 @@ void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopIn F32 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 3; k += 4) { float32x4_t in_v = vld1q_f32(inputPtr + k); - float32x4_t max_v = vld1q_f32(maxBuffer + k); - float32x4_t sub_v = vsubq_f32(in_v, max_v); - float32x4_t exp_v = vexpq_f32_03_percent_error(sub_v); + if (!logsoftmax) { + in_v = vsubq_f32(in_v, vld1q_f32(maxBuffer + k)); + } + float32x4_t exp_v = vexpq_f32_03_percent_error(in_v); float32x4_t sum_v = vld1q_f32(sumBuffer + k); sum_v = vaddq_f32(sum_v, exp_v); vst1q_f32(sumBuffer + k, sum_v); - vst1q_f32(outputPtr + k, exp_v); + if (!logsoftmax) { + vst1q_f32(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F32 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 3; k += 4) { - float32x4_t out_v = vld1q_f32(outputPtr + k); float32x4_t sum_v = vld1q_f32(sumBuffer + k); - out_v = vdivq_f32(out_v, sum_v); - vst1q_f32(outputPtr + k, out_v); + sum_v = vlogq_f32(sum_v); + vst1q_f32(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t out_v = vld1q_f32(inputPtr + k); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + out_v = vsubq_f32(out_v, sum_v); + vst1q_f32(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 3; k += 4) { + float32x4_t out_v = vld1q_f32(outputPtr + k); + float32x4_t sum_v = vld1q_f32(sumBuffer + k); + out_v = vdivq_f32(out_v, sum_v); + vst1q_f32(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +template <bool logsoftmax> +static EE softmax_kernel( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -145,9 +196,20 @@ EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp32(input, loop_outer, loops, output); + softmax_lastAxis_fp32<logsoftmax>(input, loop_outer, loops, output); } else { - softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp32<logsoftmax>(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel<false>(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel<true>(inputDesc, input, axis, outputDesc, output); +}
diff --git a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h index 01cc8eb8..9391c04c 100644 --- a/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h +++ b/compute/tensor/src/cpu/arm/fp32/tensor_computing_fp32.h @@ -55,8 +55,8 @@ EE convolution_gemm_V8(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#else -EE convolution_gemm_V7(TensorDesc inputDesc, + +EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, const F32 *filterArray, @@ -68,10 +68,8 @@ EE convolution_gemm_V7(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#endif - -#ifdef __aarch64__ -EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, +#else +EE convolution_gemm_V7(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, const F32 *filterArray, @@ -83,7 +81,7 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, TensorDesc outputDesc, F32 *outArray, ActivationParamSpec activationDesc); -#else + EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, F32 *inArray, TensorDesc filterDesc, @@ -138,6 +136,7 @@ EE pooling_bp_c8_fp32(const F32 *input, int hend, int wstart, int wend, + int poolSize, F32 *output, U32 stride, PoolingParamSpec poolingParamSpec); @@ -145,6 +144,9 @@ EE pooling_bp_c8_fp32(const F32 *input, EE softmax_fp32( TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + EE concat_fp32(std::vector inputDesc, std::vector input, TensorDesc outputDesc, @@ -243,8 +245,13 @@ EE power_fp32(TensorDesc inputDesc, TensorDesc outputDesc, F32 *output); -EE layer_normalization_fp32( - TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output); EE scale_fp32(F32 *input, I32 axis,
diff --git a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp index 2f121fca..b27091ff 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_V7.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_V7(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; @@ -116,53 +116,52 @@ EE
convolution_gemm_V7(TensorDesc inputDesc, // NHWChw6 F32 *in_pack_c8hw6 = thread_in_pack + (id * params[0] + c) * 8 * 6; - __asm__ __volatile__("vld1.f32 {d0-d3}, [%[in_0]]\n" - "vld1.f32 {d4-d7}, [%[in_1]]\n" - "vld1.f32 {d8-d11}, [%[in_2]]\n" - "vld1.f32 {d12-d15}, [%[in_3]]\n" - "vld1.f32 {d16-d19}, [%[in_4]]\n" - "vld1.f32 {d20-d23}, [%[in_5]]\n" - - "vzip.32 q0, q2\n" - "vzip.32 q4, q6\n" - "vzip.32 q8, q10\n" - - "vst1.f32 {d0}, [%[pack]]!\n" - "vst1.f32 {d8}, [%[pack]]!\n" - "vst1.f32 {d16}, [%[pack]]!\n" - "vst1.f32 {d1}, [%[pack]]!\n" - "vst1.f32 {d9}, [%[pack]]!\n" - "vst1.f32 {d17}, [%[pack]]!\n" - "vst1.f32 {d4}, [%[pack]]!\n" - "vst1.f32 {d12}, [%[pack]]!\n" - "vst1.f32 {d20}, [%[pack]]!\n" - "vst1.f32 {d5}, [%[pack]]!\n" - "vst1.f32 {d13}, [%[pack]]!\n" - "vst1.f32 {d21}, [%[pack]]!\n" - - "vzip.32 q1, q3\n" - "vzip.32 q5, q7\n" - "vzip.32 q9, q11\n" - - "vst1.f32 {d2}, [%[pack]]!\n" - "vst1.f32 {d10}, [%[pack]]!\n" - "vst1.f32 {d18}, [%[pack]]!\n" - "vst1.f32 {d3}, [%[pack]]!\n" - "vst1.f32 {d11}, [%[pack]]!\n" - "vst1.f32 {d19}, [%[pack]]!\n" - "vst1.f32 {d6}, [%[pack]]!\n" - "vst1.f32 {d14}, [%[pack]]!\n" - "vst1.f32 {d22}, [%[pack]]!\n" - "vst1.f32 {d7}, [%[pack]]!\n" - "vst1.f32 {d15}, [%[pack]]!\n" - "vst1.f32 {d23}, [%[pack]]!\n" - : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), - [in_1] "+r"(in_1), [in_2] "+r"(in_2), - [in_3] "+r"(in_3), [in_4] "+r"(in_4), - [in_5] "+r"(in_5) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", - "q5", "q6", "q7", "q8", "q9", "q10", "q11"); + __asm__ __volatile__( + "vld1.f32 {d0-d3}, [%[in_0]]\n" + "vld1.f32 {d4-d7}, [%[in_1]]\n" + "vld1.f32 {d8-d11}, [%[in_2]]\n" + "vld1.f32 {d12-d15}, [%[in_3]]\n" + "vld1.f32 {d16-d19}, [%[in_4]]\n" + "vld1.f32 {d20-d23}, [%[in_5]]\n" + + "vzip.32 q0, q2\n" + "vzip.32 q4, q6\n" + "vzip.32 q8, q10\n" + + "vst1.f32 {d0}, [%[pack]]!\n" + "vst1.f32 {d8}, [%[pack]]!\n" + "vst1.f32 {d16}, [%[pack]]!\n" + "vst1.f32 {d1}, [%[pack]]!\n" + "vst1.f32 {d9}, [%[pack]]!\n" + "vst1.f32 {d17}, [%[pack]]!\n" + "vst1.f32 {d4}, [%[pack]]!\n" + "vst1.f32 {d12}, [%[pack]]!\n" + "vst1.f32 {d20}, [%[pack]]!\n" + "vst1.f32 {d5}, [%[pack]]!\n" + "vst1.f32 {d13}, [%[pack]]!\n" + "vst1.f32 {d21}, [%[pack]]!\n" + + "vzip.32 q1, q3\n" + "vzip.32 q5, q7\n" + "vzip.32 q9, q11\n" + + "vst1.f32 {d2}, [%[pack]]!\n" + "vst1.f32 {d10}, [%[pack]]!\n" + "vst1.f32 {d18}, [%[pack]]!\n" + "vst1.f32 {d3}, [%[pack]]!\n" + "vst1.f32 {d11}, [%[pack]]!\n" + "vst1.f32 {d19}, [%[pack]]!\n" + "vst1.f32 {d6}, [%[pack]]!\n" + "vst1.f32 {d14}, [%[pack]]!\n" + "vst1.f32 {d22}, [%[pack]]!\n" + "vst1.f32 {d7}, [%[pack]]!\n" + "vst1.f32 {d15}, [%[pack]]!\n" + "vst1.f32 {d23}, [%[pack]]!\n" + : [pack] "+r"(in_pack_c8hw6), [in_0] "+r"(in_0), [in_1] "+r"(in_1), + [in_2] "+r"(in_2), [in_3] "+r"(in_3), [in_4] "+r"(in_4), [in_5] "+r"(in_5) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11"); } } } diff --git a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp index 2bf21d2f..c35ddb1a 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/convolution_gemm_icnchw_V7.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if 
(tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -66,9 +66,9 @@ EE convolution_gemm_icnchw_V7(TensorDesc inputDesc, return NOT_SUPPORTED; } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; diff --git a/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp b/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp index 60e19d3b..1e2aa034 100644 --- a/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v7/depthwise_pointwise_convolution_direct_V7.cpp @@ -45,10 +45,10 @@ EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -75,20 +75,20 @@ EE depthwise_pointwise_convolution_direct_V7(TensorDesc inputDesc, F32 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp index 846b844c..33391996 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_V8.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); 
CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -56,9 +56,9 @@ EE convolution_gemm_V8(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; @@ -216,9 +216,8 @@ EE convolution_gemm_V8(TensorDesc inputDesc, : : [pack] "r"(in_pack_c8hw12), [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), - [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), - [in_8] "r"(in_8), [in_9] "r"(in_9), [in_10] "r"(in_10), - [in_11] "r"(in_11) + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7), [in_8] "r"(in_8), + [in_9] "r"(in_9), [in_10] "r"(in_10), [in_11] "r"(in_11) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", @@ -756,8 +755,7 @@ EE convolution_gemm_V8(TensorDesc inputDesc, "st4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[pack]], #64\n" "st4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[pack]]\n" : [pack] "+r"(in_pack_c8hw4) - : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) + : [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } } diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp index a6b72066..5f568202 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_gemm_icnchw_V8.cpp @@ -42,7 +42,7 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ft = ot = 1; p.dilatedRate_t = p.stride_t = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); CHECK_STATUS(tensor5dGet(filterDesc, &fdt, &fdf, &fn, &fc, &ft, &fh, &fw)); @@ -55,9 +55,9 @@ EE convolution_gemm_icnchw_V8(TensorDesc inputDesc, } oc /= 8; - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; I64 K = ic * ft * fh * fw; I32 ohow = ot * oh * ow; F32 *in_pack = ((F32 *)tmp) + ic * it_pad * ih_pad * iw_pad; diff --git a/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp index fe479f40..b2e375c4 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/convolution_winograd_V8.cpp @@ -40,10 +40,10 @@ EE convolution_winograd_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - 
U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8) { CHECK_STATUS(NOT_MATCH); @@ -78,8 +78,8 @@ EE convolution_winograd_V8(TensorDesc inputDesc, EE ret = SUCCESS; // copy input into a input with padding for (U32 n = 0; n < in; n++) { - convParamSpec.padding_bottom = pad_bottom; - convParamSpec.padding_right = pad_right; + convParamSpec.pad_bottom = pad_bottom; + convParamSpec.pad_right = pad_right; F32 *inArray_pad = convolution_input_padding_per_channel( n, ic, 1, ih, iw, convParamSpec, inArray, (F32 *)tmp); @@ -796,7 +796,7 @@ EE convolution_winograd_V8(TensorDesc inputDesc, trans_I_4x4_3x3(Iw_ptr1, I1); for (U32 i = 0; i < 36; i++) { F32 *itm = itmArray_mov + i * ic * 8; - memcpy(itm, Iw[i], 8 * bytesOf(idt)); + UNI_MEMCPY(itm, Iw[i], 8 * bytesOf(idt)); } } for (I32 o = 0; o < I32(oc); o++) { diff --git a/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp b/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp index 6e0903ae..8b6ecf06 100644 --- a/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp +++ b/compute/tensor/src/cpu/arm/fp32/v8/depthwise_pointwise_convolution_direct_V8.cpp @@ -43,10 +43,10 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -70,20 +70,20 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(fdt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(fdt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(fdt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(fdt)); inArray_pad_mov += paddingB * iw_pad * 8; } @@ -143,41 +143,40 @@ EE depthwise_pointwise_convolution_direct_V8(TensorDesc inputDesc, F32 *in_5 = in_idx + in_h_5 * iw_pad * 8 + in_w_5 * 8; F32 *in_6 = in_idx + in_h_6 * iw_pad * 8 + in_w_6 * 8; F32 *in_7 = in_idx + in_h_7 * iw_pad * 8 + in_w_7 * 8; - __asm__ __volatile__("ldp q16, q17, [%[f0]]\n" - 
"ldp q30, q31, [%[in0]]\n" - "ldp q18, q19, [%[in1]]\n" - "ldp q20, q21, [%[in2]]\n" - "ldp q22, q23, [%[in3]]\n" - "ldp q24, q25, [%[in4]]\n" - "ldp q26, q27, [%[in5]]\n" - "ldp q28, q29, [%[in6]]\n" - - "fmla v0.4s, v30.4s, v16.4s\n" - "fmla v1.4s, v31.4s, v17.4s\n" - "fmla v2.4s, v18.4s, v16.4s\n" - "ldp q30, q31, [%[in7]]\n" - "fmla v3.4s, v19.4s, v17.4s\n" - "fmla v4.4s, v20.4s, v16.4s\n" - "fmla v5.4s, v21.4s, v17.4s\n" - "fmla v6.4s, v22.4s, v16.4s\n" - "fmla v7.4s, v23.4s, v17.4s\n" - "fmla v8.4s, v24.4s, v16.4s\n" - "fmla v9.4s, v25.4s, v17.4s\n" - "fmla v10.4s, v26.4s, v16.4s\n" - "fmla v11.4s, v27.4s, v17.4s\n" - "fmla v12.4s, v28.4s, v16.4s\n" - "fmla v13.4s, v29.4s, v17.4s\n" - "fmla v14.4s, v30.4s, v16.4s\n" - "fmla v15.4s, v31.4s, v17.4s\n" - : - : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), - [in3] "r"(in_3), [in4] "r"(in_4), [in5] "r"(in_5), - [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", - "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", - "v30", "v31"); + __asm__ __volatile__( + "ldp q16, q17, [%[f0]]\n" + "ldp q30, q31, [%[in0]]\n" + "ldp q18, q19, [%[in1]]\n" + "ldp q20, q21, [%[in2]]\n" + "ldp q22, q23, [%[in3]]\n" + "ldp q24, q25, [%[in4]]\n" + "ldp q26, q27, [%[in5]]\n" + "ldp q28, q29, [%[in6]]\n" + + "fmla v0.4s, v30.4s, v16.4s\n" + "fmla v1.4s, v31.4s, v17.4s\n" + "fmla v2.4s, v18.4s, v16.4s\n" + "ldp q30, q31, [%[in7]]\n" + "fmla v3.4s, v19.4s, v17.4s\n" + "fmla v4.4s, v20.4s, v16.4s\n" + "fmla v5.4s, v21.4s, v17.4s\n" + "fmla v6.4s, v22.4s, v16.4s\n" + "fmla v7.4s, v23.4s, v17.4s\n" + "fmla v8.4s, v24.4s, v16.4s\n" + "fmla v9.4s, v25.4s, v17.4s\n" + "fmla v10.4s, v26.4s, v16.4s\n" + "fmla v11.4s, v27.4s, v17.4s\n" + "fmla v12.4s, v28.4s, v16.4s\n" + "fmla v13.4s, v29.4s, v17.4s\n" + "fmla v14.4s, v30.4s, v16.4s\n" + "fmla v15.4s, v31.4s, v17.4s\n" + : + : [in0] "r"(in_0), [in1] "r"(in_1), [in2] "r"(in_2), [in3] "r"(in_3), + [in4] "r"(in_4), [in5] "r"(in_5), [in6] "r"(in_6), [in7] "r"(in_7), [f0] "r"(f_0) + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", + "v29", "v30", "v31"); } } diff --git a/compute/tensor/src/cpu/arm/int32/scale.cpp b/compute/tensor/src/cpu/arm/int32/scale.cpp new file mode 100644 index 00000000..3a8bacbd --- /dev/null +++ b/compute/tensor/src/cpu/arm/int32/scale.cpp @@ -0,0 +1,157 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include +#include "cpu/arm/int32/tensor_computing_int32.h" + +EE scale_nchwc8_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t in_vec, out_vec; + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + int32x4_t alpha_vec0 = (alpha == nullptr) ? one : vld1q_s32(alpha + c); + int32x4_t alpha_vec1 = (alpha == nullptr) ? one : vld1q_s32(alpha + c + 4); + int32x4_t beta_vec0 = (beta == nullptr) ? zero : vld1q_s32(beta + c); + int32x4_t beta_vec1 = (beta == nullptr) ? zero : vld1q_s32(beta + c + 4); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = vld1q_s32(input + index); + out_vec = vmlaq_s32(beta_vec0, alpha_vec0, in_vec); + vst1q_s32(output + index, out_vec); + + in_vec = vld1q_s32(input + index + 4); + out_vec = vmlaq_s32(beta_vec1, alpha_vec1, in_vec); + vst1q_s32(output + index + 4, out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +EE scale_nchw_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + int32x4_t alpha_vec = (alpha == nullptr) ? one : vdupq_n_s32(alpha[c]); + int32x4_t beta_vec = (beta == nullptr) ? zero : vdupq_n_s32(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 3; i += 4) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int32x4_t in_vec = vld1q_s32(input + src); + int32x4_t out_vec = vmlaq_s32(beta_vec, alpha_vec, in_vec); + vst1q_s32(output + dst, out_vec); + dst += 4; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +EE scale_nhwc_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + int32x4_t one = vdupq_n_s32(1); + int32x4_t zero = vdupq_n_s32(0); + int32x4_t in_vec; + int in_s; + for (I32 n = 0, src = 0, dst = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++, src++) { + I32 c = 0; + for (; c < ic - 3; c += 4) { + int32x4_t alpha_vec = (alpha == nullptr) ? one : vld1q_s32(alpha + c); + int32x4_t beta_vec = (beta == nullptr) ? zero : vld1q_s32(beta + c); + if (icoc_equal) { + in_vec = vld1q_s32(input + dst); + } else { + in_vec = vdupq_n_s32(input[src]); + } + int32x4_t out_vec = vmlaq_s32(beta_vec, alpha_vec, in_vec); + vst1q_s32(output + dst, out_vec); + dst += 4; + } + for (; c < ic; c++) { + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + if (icoc_equal) { + in_s = input[dst]; + } else { + in_s = input[src]; + } + output[dst] = alpha_s * in_s + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If oc is 1, it means that weights/vectors have only one param, so we need to use the calculation logic of nchw. + if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims - 1) { + if (ic == oc) { + ret = scale_nhwc_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nhwc_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims) { + ret = scale_nchwc8_int32(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +}
diff --git a/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h b/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h new file mode 100644 index 00000000..98ae11b2 --- /dev/null +++ b/compute/tensor/src/cpu/arm/int32/tensor_computing_int32.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ +#ifndef _TENSOR_COMPUTING_INT32_H +#define _TENSOR_COMPUTING_INT32_H + +#include "error.h" + +#include "thread_affinity.h" + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output); +#endif diff --git a/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h index 1b91961a..fb42e2ee 100644 --- a/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h +++ b/compute/tensor/src/cpu/arm/int8/arm_functions_int8.h @@ -14,40 +14,43 @@ #ifndef _H_ARM_FUNCTIONS_INT8 #define _H_ARM_FUNCTIONS_INT8 +#include "cpu/cpu_functions_template.h" #include "arm_neon_expand.h" -#include "parameter_spec.h" inline EE activation_int8(INT8 *input, U32 len, ActivationParamSpec activationDesc, INT8 *output) { - int8x16_t in, out; int8x16_t zero = vdupq_n_s8(0); - U32 len_main = len / 16; - U32 len_tail = len % 16; - + U32 loops = len / 16 * 16; + EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(INT8) * len); + } + loops = len; break; } case ACTIVATION_RELU: { if (activationDesc.value[0] != 0) { - return NOT_SUPPORTED; - } - for (U32 i = 0; i < len_main; i++) { - in = vld1q_s8(input); - out = vmaxq_s8(zero, in); - vst1q_s8(output, out); - input += 16; - output += 16; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = (input[i] < 0) ? 0 : input[i]; + ret = NOT_SUPPORTED; + } else { + for (U32 i = 0; i < loops; i += 16) { + int8x16_t in = vld1q_s8(input + i); + int8x16_t out = vmaxq_s8(zero, in); + vst1q_s8(output + i, out); + } } break; } default: - return NOT_SUPPORTED; + ret = NOT_SUPPORTED; + break; } - - return SUCCESS; + if (ret == SUCCESS) { + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } + return ret; } #endif diff --git a/compute/tensor/src/cpu/arm/int8/concat.cpp b/compute/tensor/src/cpu/arm/int8/concat.cpp index 9281e180..3bcf72a5 100644 --- a/compute/tensor/src/cpu/arm/int8/concat.cpp +++ b/compute/tensor/src/cpu/arm/int8/concat.cpp @@ -26,7 +26,7 @@ EE concat_int8(std::vector inputDesc, CHECK_STATUS(NOT_MATCH); } if (inputDesc.size() == 1) { - memcpy(output, input[0], tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input[0], tensorNumBytes(outputDesc)); return SUCCESS; } if (concatDim != 0 && concatDim != 1) { @@ -113,7 +113,7 @@ EE concat_int8(std::vector inputDesc, for (U32 i = 0; i < inputDesc.size(); i++) { copySize = tensorNumElements(inputDesc[i]) * sizeof(INT8); - memcpy(out_ptr, input[i], copySize); + UNI_MEMCPY(out_ptr, input[i], copySize); out_ptr = out_ptr + copySize; } return SUCCESS; @@ -129,7 +129,7 @@ EE concat_int8(std::vector inputDesc, copySize = tensorNumElements(inputDesc[i]) / in * sizeof(INT8); - memcpy(out_ptr, (INT8 *)input[i] + j * copySize, copySize); + UNI_MEMCPY(out_ptr, (INT8 *)input[i] + j * copySize, copySize); out_ptr = out_ptr + copySize; } } diff --git a/compute/tensor/src/cpu/arm/int8/convolution.cpp b/compute/tensor/src/cpu/arm/int8/convolution.cpp index 10adc050..d614b935 100644 --- a/compute/tensor/src/cpu/arm/int8/convolution.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution.cpp @@ -12,10 +12,10 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/arm/int8/tensor_computing_int8.h" -#ifdef __aarch64__ -#include "cpu/arm/int8/v8/convolution_winograd.h" -#include "cpu/arm/int8/v8/convolution_gemm.h" -#else +#if defined(_USE_FP16) +#include "cpu/arm/int8/v8.2/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" +#elif !defined(__aarch64__) #include "cpu/arm/int8/v7/convolution_gemm.h" #endif #include "tensor_transpose.h" @@ -74,23 +74,25 @@ EE convolution_int8(TensorDesc inputDesc, inputPtr = tmpPtr; tmpPtr += tensorNumBytes(inputDesc); tmpBytes -= tensorNumBytes(inputDesc); - algorithm = CONVOLUTION_ALGORITHM_GEMM; + //algorithm = CONVOLUTION_ALGORITHM_GEMM; } EE ret = SUCCESS; switch (algorithm) { -#ifdef __aarch64__ +#if defined(_USE_FP16) case CONVOLUTION_ALGORITHM_WINOGRAD: ret = convolution_winograd(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, activationDesc, arch); break; #endif +#if defined(_USE_FP16) || !defined(__aarch64__) case CONVOLUTION_ALGORITHM_GEMM: ret = convolution_gemm(inputDesc, inputPtr, scales, filterDesc, filter, scales + 2, convParamSpec, biasDesc, bias, tmpBytes, tmpPtr, outputDesc, output, scales + 1, activationDesc, arch); break; +#endif default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp index 63c0f446..dfd0d90d 100644 --- a/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp +++ b/compute/tensor/src/cpu/arm/int8/convolution_transform.cpp @@ -32,7 +32,7 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftm, filter, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { diff --git a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp index 04c23b4a..fb01e18a 100644 --- a/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/arm/int8/depthwise_pointwise_convolution.cpp @@ -59,12 +59,14 @@ EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, EE ret = SUCCESS; switch (algorithm) { +#if defined(_USE_FP16) || !defined(__aarch64__) case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: ret = depthwise_pointwise_convolution_direct(inputDesc, input, dwFilterDesc, dwFilter, pwFilterDesc, pwFilter, convParamSpec, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); break; +#endif default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp b/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp index 29d5ec37..f004f065 100644 --- a/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp +++ b/compute/tensor/src/cpu/arm/int8/v7/convolution_gemm.cpp @@ -45,10 +45,10 @@ EE convolution_gemm_v7(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = p.stride_h; U32 strideW = p.stride_w; - U32 paddingT = p.padding_top; - U32 paddingB = p.padding_bottom; - U32 paddingL = p.padding_left; - U32 paddingR = p.padding_right; + U32 paddingT = p.pad_top; + U32 paddingB = p.pad_bottom; + U32 paddingL = p.pad_left; + U32 paddingR = 
p.pad_right; U32 dilateH = p.dilatedRate_h; U32 dilateW = p.dilatedRate_w; @@ -160,108 +160,108 @@ EE convolution_gemm_v7(TensorDesc inputDesc, I32 *out_buf = biasScaled + oc * 8 + ((n * oc + o) * ohow + hw) * 8; F32 *out_o0hw0 = outArray + ((n * oc + o) * ohow + hw) * 8; #if 1 - asm volatile("cmp %[out_f32], #0\n" - "beq 0f\n" - "vmov.s32 q4, #0.\n" - "vmov.s32 q5, #0.\n" - "vmov.s32 q6, #0.\n" - "vmov.s32 q7, #0.\n" - "vmov.s32 q8, #0.\n" - "vmov.s32 q9, #0.\n" - "vmov.s32 q10, #0.\n" - "vmov.s32 q11, #0.\n" - "b 1f\n" - - "0:\n" - "vld1.s32 {d8-d11}, [%[b0_s]]\n" - "vld1.s32 {d12-d15}, [%[b0_s]]\n" - "vld1.s32 {d16-d19}, [%[b0_s]]\n" - "vld1.s32 {d20-d23}, [%[b0_s]]\n" - - "1:\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - - "vld1.s8 {d4-d5}, [%[w]]!\n" - - // K- > r2 - "mov r2, %[K]\n" - - // Computation loop - "2:\n" - - "vmull.s8 q12, d4, d0\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmull.s8 q13, d4, d1\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmull.s8 q14, d4, d2\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vmull.s8 q15, d4, d3\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d4}, [%[w]]!\n" - - "vmlal.s8 q12, d5, d0\n" - "vmlal.s8 q13, d5, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d5, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d5, d3\n" - - //"vaddw.s16 q4, q4, d24\n" - //"vaddw.s16 q5, q5, d25\n" - //"vaddw.s16 q6, q6, d26\n" - //"vaddw.s16 q7, q7, d27\n" - //"vaddw.s16 q8, q8, d28\n" - //"vaddw.s16 q9, q9, d29\n" - //"vaddw.s16 q10, q10, d30\n" - //"vaddw.s16 q11, q11, d31\n" - //"vmov.s32 q12, #0\n" - //"vmov.s32 q13, #0\n" - //"vmov.s32 q14, #0\n" - //"vmov.s32 q15, #0\n" - - "vld1.s8 {d2[]}, [%[in]]!\n" - "vmlal.s8 q12, d4, d0\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d5}, [%[w]]!\n" - "vmlal.s8 q13, d4, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d4, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d4, d3\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - - "vmlal.s8 q12, d5, d0\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d4}, [%[w]]!\n" - "vmlal.s8 q13, d5, d1\n" - "vld1.s8 {d0[]}, [%[in]]!\n" - "vmlal.s8 q14, d5, d2\n" - "vld1.s8 {d1[]}, [%[in]]!\n" - "vmlal.s8 q15, d5, d3\n" - "vld1.s8 {d2[]}, [%[in]]!\n" - "vld1.s8 {d3[]}, [%[in]]!\n" - "vld1.s8 {d5}, [%[w]]!\n" - - "subs r2, r2, #4\n" - - "vaddw.s16 q4, q4, d24\n" - "vaddw.s16 q5, q5, d25\n" - "vaddw.s16 q6, q6, d26\n" - "vaddw.s16 q7, q7, d27\n" - "vaddw.s16 q8, q8, d28\n" - "vaddw.s16 q9, q9, d29\n" - "vaddw.s16 q10, q10, d30\n" - "vaddw.s16 q11, q11, d31\n" - - "bne 2b\n" - : [in] "+r"(in_hw), [w] "+r"(f_o) - : [K] "r"((I64)(ic * fh * fw * 8)), [b0_s] "r"(b0_s), - [out_f32] "r"(out_f32_bool) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); + asm volatile( + "cmp %[out_f32], #0\n" + "beq 0f\n" + "vmov.s32 q4, #0.\n" + "vmov.s32 q5, #0.\n" + "vmov.s32 q6, #0.\n" + "vmov.s32 q7, #0.\n" + "vmov.s32 q8, #0.\n" + "vmov.s32 q9, #0.\n" + "vmov.s32 q10, #0.\n" + "vmov.s32 q11, #0.\n" + "b 1f\n" + + "0:\n" + "vld1.s32 {d8-d11}, [%[b0_s]]\n" + "vld1.s32 {d12-d15}, [%[b0_s]]\n" + "vld1.s32 {d16-d19}, [%[b0_s]]\n" + "vld1.s32 {d20-d23}, [%[b0_s]]\n" + + "1:\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + + "vld1.s8 {d4-d5}, [%[w]]!\n" + + // K- > r2 + "mov r2, %[K]\n" + + // Computation loop + "2:\n" + + "vmull.s8 q12, d4, d0\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmull.s8 q13, 
d4, d1\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmull.s8 q14, d4, d2\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vmull.s8 q15, d4, d3\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d4}, [%[w]]!\n" + + "vmlal.s8 q12, d5, d0\n" + "vmlal.s8 q13, d5, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d5, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d5, d3\n" + + //"vaddw.s16 q4, q4, d24\n" + //"vaddw.s16 q5, q5, d25\n" + //"vaddw.s16 q6, q6, d26\n" + //"vaddw.s16 q7, q7, d27\n" + //"vaddw.s16 q8, q8, d28\n" + //"vaddw.s16 q9, q9, d29\n" + //"vaddw.s16 q10, q10, d30\n" + //"vaddw.s16 q11, q11, d31\n" + //"vmov.s32 q12, #0\n" + //"vmov.s32 q13, #0\n" + //"vmov.s32 q14, #0\n" + //"vmov.s32 q15, #0\n" + + "vld1.s8 {d2[]}, [%[in]]!\n" + "vmlal.s8 q12, d4, d0\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d5}, [%[w]]!\n" + "vmlal.s8 q13, d4, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d4, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d4, d3\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + + "vmlal.s8 q12, d5, d0\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d4}, [%[w]]!\n" + "vmlal.s8 q13, d5, d1\n" + "vld1.s8 {d0[]}, [%[in]]!\n" + "vmlal.s8 q14, d5, d2\n" + "vld1.s8 {d1[]}, [%[in]]!\n" + "vmlal.s8 q15, d5, d3\n" + "vld1.s8 {d2[]}, [%[in]]!\n" + "vld1.s8 {d3[]}, [%[in]]!\n" + "vld1.s8 {d5}, [%[w]]!\n" + + "subs r2, r2, #4\n" + + "vaddw.s16 q4, q4, d24\n" + "vaddw.s16 q5, q5, d25\n" + "vaddw.s16 q6, q6, d26\n" + "vaddw.s16 q7, q7, d27\n" + "vaddw.s16 q8, q8, d28\n" + "vaddw.s16 q9, q9, d29\n" + "vaddw.s16 q10, q10, d30\n" + "vaddw.s16 q11, q11, d31\n" + + "bne 2b\n" + : [in] "+r"(in_hw), [w] "+r"(f_o) + : [K] "r"((I64)(ic * fh * fw * 8)), [b0_s] "r"(b0_s), [out_f32] "r"(out_f32_bool) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1", "r2"); asm volatile("cmp %[out_f32], #0\n" "beq 4f\n" @@ -317,79 +317,79 @@ EE convolution_gemm_v7(TensorDesc inputDesc, : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1"); - asm volatile("cmp %[out_f32], #0\n" - "bne 8f\n" - - "4:\n" - "cmp %[conv_relu], #0\n" - "beq 5f\n" - "vmov.s32 q0, #0\n" - "vmaxq.s32 q4, q4, q0\n" - "vmaxq.s32 q5, q5, q0\n" - "vmaxq.s32 q6, q6, q0\n" - "vmaxq.s32 q7, q7, q0\n" - "vmaxq.s32 q8, q8, q0\n" - "vmaxq.s32 q9, q9, q0\n" - "vmaxq.s32 q10, q10, q0\n" - "vmaxq.s32 q11, q11, q0\n" - - "5:\n" - "vld1.s32 {d0-d1}, [%[max_i32]]\n" - "vld1.s32 {d2-d3}, [%[min_i32]]\n" - "cmp %[scale_known], #0\n" - "beq 6f\n" - "vmaxq.s32 q4, q4, q1\n" - "vmaxq.s32 q5, q5, q1\n" - "vmaxq.s32 q6, q6, q1\n" - "vmaxq.s32 q7, q7, q1\n" - "vmaxq.s32 q8, q8, q1\n" - "vmaxq.s32 q9, q9, q1\n" - "vmaxq.s32 q10, q10, q1\n" - "vmaxq.s32 q11, q11, q1\n" - "vminq.s32 q4, q4, q0\n" - "vminq.s32 q5, q5, q0\n" - "vminq.s32 q6, q6, q0\n" - "vminq.s32 q7, q7, q0\n" - "vminq.s32 q8, q8, q0\n" - "vminq.s32 q9, q9, q0\n" - "vminq.s32 q10, q10, q0\n" - "vminq.s32 q11, q11, q0\n" - "b 7f\n" - - "6:\n" - "vmaxq.s32 q0, q4, q0\n" - "vmaxq.s32 q0, q5, q0\n" - "vmaxq.s32 q0, q6, q0\n" - "vmaxq.s32 q0, q7, q0\n" - "vmaxq.s32 q0, q8, q0\n" - "vmaxq.s32 q0, q9, q0\n" - "vmaxq.s32 q0, q10, q0\n" - "vmaxq.s32 q0, q11, q0\n" - "vminq.s32 q1, q4, q1\n" - "vminq.s32 q1, q5, q1\n" - "vminq.s32 q1, q6, q1\n" - "vminq.s32 q1, q7, q1\n" - "vminq.s32 q1, q8, q1\n" - "vminq.s32 q1, q9, q1\n" - "vminq.s32 q1, q10, q1\n" - "vminq.s32 q1, q11, q1\n" - "vst1.s32 {d0-d1}, [%[max_i32]]\n" - "vst1.s32 {d2-d3}, [%[min_i32]]\n" - - "7:\n" - "mov r1, 
%[out_buf]\n" - "vst1.s32 {d8-d11}, [r1]!\n" - "vst1.s32 {d12-d15}, [r1]!\n" - "vst1.s32 {d16-d19}, [r1]!\n" - "vst1.s32 {d20-d23}, [r1]\n" - - "8:\n" - : [out_buf] "+r"(out_buf) - : [max_i32] "r"(max_i32), [min_i32] "r"(min_i32), - [conv_relu] "r"(conv_relu_bool), [out_f32] "r"(out_f32_bool), - [scale_known] "r"(scale_known_bool) - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r1"); + asm volatile( + "cmp %[out_f32], #0\n" + "bne 8f\n" + + "4:\n" + "cmp %[conv_relu], #0\n" + "beq 5f\n" + "vmov.s32 q0, #0\n" + "vmaxq.s32 q4, q4, q0\n" + "vmaxq.s32 q5, q5, q0\n" + "vmaxq.s32 q6, q6, q0\n" + "vmaxq.s32 q7, q7, q0\n" + "vmaxq.s32 q8, q8, q0\n" + "vmaxq.s32 q9, q9, q0\n" + "vmaxq.s32 q10, q10, q0\n" + "vmaxq.s32 q11, q11, q0\n" + + "5:\n" + "vld1.s32 {d0-d1}, [%[max_i32]]\n" + "vld1.s32 {d2-d3}, [%[min_i32]]\n" + "cmp %[scale_known], #0\n" + "beq 6f\n" + "vmaxq.s32 q4, q4, q1\n" + "vmaxq.s32 q5, q5, q1\n" + "vmaxq.s32 q6, q6, q1\n" + "vmaxq.s32 q7, q7, q1\n" + "vmaxq.s32 q8, q8, q1\n" + "vmaxq.s32 q9, q9, q1\n" + "vmaxq.s32 q10, q10, q1\n" + "vmaxq.s32 q11, q11, q1\n" + "vminq.s32 q4, q4, q0\n" + "vminq.s32 q5, q5, q0\n" + "vminq.s32 q6, q6, q0\n" + "vminq.s32 q7, q7, q0\n" + "vminq.s32 q8, q8, q0\n" + "vminq.s32 q9, q9, q0\n" + "vminq.s32 q10, q10, q0\n" + "vminq.s32 q11, q11, q0\n" + "b 7f\n" + + "6:\n" + "vmaxq.s32 q0, q4, q0\n" + "vmaxq.s32 q0, q5, q0\n" + "vmaxq.s32 q0, q6, q0\n" + "vmaxq.s32 q0, q7, q0\n" + "vmaxq.s32 q0, q8, q0\n" + "vmaxq.s32 q0, q9, q0\n" + "vmaxq.s32 q0, q10, q0\n" + "vmaxq.s32 q0, q11, q0\n" + "vminq.s32 q1, q4, q1\n" + "vminq.s32 q1, q5, q1\n" + "vminq.s32 q1, q6, q1\n" + "vminq.s32 q1, q7, q1\n" + "vminq.s32 q1, q8, q1\n" + "vminq.s32 q1, q9, q1\n" + "vminq.s32 q1, q10, q1\n" + "vminq.s32 q1, q11, q1\n" + "vst1.s32 {d0-d1}, [%[max_i32]]\n" + "vst1.s32 {d2-d3}, [%[min_i32]]\n" + + "7:\n" + "mov r1, %[out_buf]\n" + "vst1.s32 {d8-d11}, [r1]!\n" + "vst1.s32 {d12-d15}, [r1]!\n" + "vst1.s32 {d16-d19}, [r1]!\n" + "vst1.s32 {d20-d23}, [r1]\n" + + "8:\n" + : [out_buf] "+r"(out_buf) + : [max_i32] "r"(max_i32), [min_i32] "r"(min_i32), [conv_relu] "r"(conv_relu_bool), + [out_f32] "r"(out_f32_bool), [scale_known] "r"(scale_known_bool) + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15", "r1"); #else int32x4_t res[4][2] = {0}; if (out_f32_bool == 0) { diff --git a/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp index 4c81efaa..499d3d2f 100644 --- a/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/int8/v7/depthwise_pointwise_convolution_direct.cpp @@ -46,10 +46,10 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -77,20 +77,20 @@ EE 
depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp index 16d0a9be..8b622a81 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A55.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A55.cpp @@ -11,8 +11,9 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_gemm.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #include "cpu/arm/transform_functions.h" +#include "cpu/tensor_computing_cpu.h" template EE convolution_gemm_A55(TensorDesc inputDesc, @@ -44,10 +45,10 @@ EE convolution_gemm_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -84,7 +85,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, I32 min_i32[4] = {0}; // To record min I32 values for (U32 n = 0; n < in; n++) { // for each batch - F16 scale_i = 1.0; + F32 scale_i = -1.0; // quantize input if necessary if (idt == DT_F16) { @@ -97,35 +98,9 @@ EE convolution_gemm_A55(TensorDesc inputDesc, if (*inputScale > 0) { scale_i = *inputScale; } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_f16(in + i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - if (max > 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + TensorDesc tmpDesc = inputDesc; + tmpDesc.dt = DT_I8; + quantize_cpu(inputDesc, in, &tmpDesc, inArray, &scale_i, ARM_A55); } *inputScale = scale_i; } else { @@ -159,7 +134,7 @@ EE convolution_gemm_A55(TensorDesc inputDesc, } } - F32 factor_s = 1.0 / ((F32)scale_i) / ((F32)(*filterScale)); + F32 factor_s = 1.0 / scale_i / ((F32)(*filterScale)); F32 factor_v[4]; for (U32 i = 0; i < 4; i++) { factor_v[i] = factor_s; @@ -767,35 +742,35 @@ EE convolution_gemm_A55(TensorDesc inputDesc, in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + 
"ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", + "x2", "x3", "x6", "x7"); } } } @@ -1423,8 +1398,8 @@ EE convolution_gemm_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp index 33926aef..e027ba1c 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_gemm_A76.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_gemm_A76.cpp @@ -11,8 +11,9 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_gemm.h" +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #include "cpu/arm/transform_functions.h" +#include "cpu/tensor_computing_cpu.h" template EE convolution_gemm_A76(TensorDesc inputDesc, @@ -44,10 +45,10 @@ EE convolution_gemm_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -84,7 +85,7 @@ EE convolution_gemm_A76(TensorDesc inputDesc, I32 min_i32[4] = {0}; // To record min I32 values for (U32 n = 0; n < in; n++) { // for each batch - F16 scale_i = 1.0; + F32 scale_i = -1.0; // quantize input if necessary if (idt == DT_F16) { @@ -97,35 +98,9 @@ EE convolution_gemm_A76(TensorDesc inputDesc, if (*inputScale > 0) { scale_i = *inputScale; } else { - float16x8_t temp_v = vld1q_f16(in); - float16x8_t max_v = temp_v; - float16x8_t min_v = temp_v; - - for (U32 i = 8; i < numData; i += 8) { - temp_v = vld1q_f16(in + i); - max_v = vmaxq_f16(max_v, temp_v); - min_v = vminq_f16(min_v, temp_v); - } - - F16 max = vmaxvq_f16(max_v); - F16 min = vminvq_f16(min_v); - - if (max == 0 && min == 0) { - return NOT_SUPPORTED; - } - if (max > 0 && min < 0) { - F16 scale_max = 127.0 / max; - F16 scale_min = -127.0 / min; - scale_i = (scale_max < scale_min) ? scale_max : scale_min; - } else if (max < 0) { - scale_i = -127.0 / min; - } else { // min > 0 - scale_i = 127.0 / max; - } - } - for (U32 i = 0; i < numData; i++) { - F32 temp = in[i] * scale_i; - inArray[i] = round_towards_zero(temp, (*inputScale) != scale_i); + TensorDesc tmpDesc = inputDesc; + tmpDesc.dt = DT_I8; + quantize_cpu(inputDesc, in, &tmpDesc, inArray, &scale_i, ARM_A76); } *inputScale = scale_i; } else { @@ -739,35 +714,35 @@ EE convolution_gemm_A76(TensorDesc inputDesc, in_pack + c * fh * fw * 8 * 8 + fh_idx * fw * 8 * 4 + fw_idx * 8 * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins 
v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", + "x2", "x3", "x6", "x7"); } } } @@ -1358,8 +1333,8 @@ EE convolution_gemm_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + c * fh * fw * 8 + fh_idx * fw * 4 + fw_idx * 4; INT8 *in_pack_1 = in_pack_0 + fh * fw * 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } } diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp index 4f7ab9ba..83750188 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A55.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A55.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "cpu/arm/int8/v8/convolution_winograd_transform.h" -#include "cpu/arm/int8/v8/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_winograd_transform.h" +#include "cpu/arm/int8/v8.2/convolution_winograd.h" template EE convolution_winograd_A55(TensorDesc inputDesc, @@ -43,10 +43,10 @@ EE convolution_winograd_A55(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8C4) { return NOT_MATCH; @@ -88,6 +88,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, OT *inArray_pad = (OT *)tmp; short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + UNI_MEMSET(otmArray, 0, 6 * 6 * 12 * 8 * sizeof(F16)); INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); if (DT_I8 == odt) { outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack @@ -101,18 +102,18 @@ EE convolution_winograd_A55(TensorDesc inputDesc, OT *inArray_pad_mov = inArray_pad; OT *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_top * iw_pad * 8; for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); inArray_pad_mov += pad_left * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); inArray_pad_mov += pad_right * 8; } - memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_bottom * iw_pad * 8; } @@ -405,7 +406,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 12 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 12 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -806,35 +807,35 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; INT8 *in_pack_1 = in_pack_0 + 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, 
[%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", + "x3", "x6", "x7"); } } @@ -847,7 +848,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1133,21 +1134,21 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; INT8 *in_pack_1 = in_pack_0 + 4 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) - : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } @@ -1160,7 +1161,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 4 * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 4 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1349,8 +1350,8 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; INT8 *in_pack_1 = in_pack_0 + 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } @@ -1363,7 +1364,7 @@ EE convolution_winograd_A55(TensorDesc inputDesc, INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 
*out_o0hw0 = otmArray + idx * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * sizeof(OT)); continue; } int32x4_t res[2] = {0}; diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp similarity index 94% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp index 168a4ab6..ca160982 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_A76.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_A76.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "cpu/arm/int8/v8/convolution_winograd_transform.h" -#include "cpu/arm/int8/v8/convolution_winograd.h" +#include "cpu/arm/int8/v8.2/convolution_winograd_transform.h" +#include "cpu/arm/int8/v8.2/convolution_winograd.h" template EE convolution_winograd_A76(TensorDesc inputDesc, @@ -43,10 +43,10 @@ EE convolution_winograd_A76(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; if (fdf != DF_HWNCN8C4) { return NOT_MATCH; @@ -88,6 +88,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, OT *inArray_pad = (OT *)tmp; short *itmArray = (short *)(inArray_pad + ic * ihiw * 8); // will be cast to fp16 for fp16 inputs F16 *otmArray = (F16 *)(itmArray + 6 * 6 * ic * 12 * 8); + UNI_MEMSET(otmArray, 0, 6 * 6 * 12 * 8 * sizeof(F16)); INT8 *inQ = (INT8 *)(otmArray + 6 * 6 * 12 * 8); if (DT_I8 == odt) { outArray = (F16 *)(inQ + 6 * 6 * ic * 12 * 8); // After otmArray and pack @@ -101,18 +102,18 @@ EE convolution_winograd_A76(TensorDesc inputDesc, OT *inArray_pad_mov = inArray_pad; OT *inArray_mov = inArray + n * ic * ih * iw * 8; for (U32 c = 0; c < ic; c++) { - memset(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_top * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_top * iw_pad * 8; for (U32 h = pad_top; h < ih_pad - pad_bottom; h++) { - memset(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_left * 8 * bytesOf(idt)); inArray_pad_mov += pad_left * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_right * 8 * bytesOf(idt)); inArray_pad_mov += pad_right * 8; } - memset(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, pad_bottom * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += pad_bottom * iw_pad * 8; } @@ -405,7 +406,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * 
ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 12 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 12 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 12 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -787,35 +788,35 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 8 * ic * 8 + c * 8 * 8; INT8 *in_pack_1 = in_pack_0 + 8 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "ldr d4, [%[in_4]]\n" - "ldr x6, [%[in_6]]\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - - "ldr d5, [%[in_5]]\n" - "ldr x7, [%[in_7]]\n" - "ins v4.d[1], x6\n" - "ins v5.d[1], x7\n" - - "str q20, [%[pack_0]]\n" - "trn1 v24.4s, v4.4s, v5.4s\n" - "trn2 v25.4s, v4.4s, v5.4s\n" - "str q21, [%[pack_1]]\n" - "str q24, [%[pack_0], #16]\n" - "str q25, [%[pack_1], #16]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3), [in_4] "r"(in_4), [in_5] "r"(in_5), - [in_6] "r"(in_6), [in_7] "r"(in_7) - : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", - "v24", "v25", "x2", "x3", "x6", "x7"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "ldr d4, [%[in_4]]\n" + "ldr x6, [%[in_6]]\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + + "ldr d5, [%[in_5]]\n" + "ldr x7, [%[in_7]]\n" + "ins v4.d[1], x6\n" + "ins v5.d[1], x7\n" + + "str q20, [%[pack_0]]\n" + "trn1 v24.4s, v4.4s, v5.4s\n" + "trn2 v25.4s, v4.4s, v5.4s\n" + "str q21, [%[pack_1]]\n" + "str q24, [%[pack_0], #16]\n" + "str q25, [%[pack_1], #16]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] "r"(in_3), [in_4] "r"(in_4), + [in_5] "r"(in_5), [in_6] "r"(in_6), [in_7] "r"(in_7) + : "memory", "cc", "v0", "v1", "v4", "v5", "v20", "v21", "v24", "v25", "x2", + "x3", "x6", "x7"); } } @@ -828,7 +829,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8 * 8; if (factor_v[idx][0] == 0) { // input pixels are all 0 - memset(out_o0hw0, 0, 8 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1099,21 +1100,21 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * 4 * ic * 8 + c * 4 * 8; INT8 *in_pack_1 = in_pack_0 + 4 * 4; - __asm__ __volatile__("ldr d0, [%[in_0]]\n" - "ldr x2, [%[in_2]]\n" - "ldr d1, [%[in_1]]\n" - "ldr x3, [%[in_3]]\n" - "ins v0.d[1], x2\n" - "ins v1.d[1], x3\n" - "trn1 v20.4s, v0.4s, v1.4s\n" - "trn2 v21.4s, v0.4s, v1.4s\n" - "str q20, [%[pack_0]]\n" - "str q21, [%[pack_1]]\n" - : - : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), - [in_0] "r"(in_0), [in_1] "r"(in_1), [in_2] "r"(in_2), - [in_3] "r"(in_3) - : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); + __asm__ __volatile__( + "ldr d0, [%[in_0]]\n" + "ldr x2, [%[in_2]]\n" + "ldr d1, [%[in_1]]\n" + "ldr x3, [%[in_3]]\n" + "ins v0.d[1], x2\n" + "ins v1.d[1], x3\n" + "trn1 v20.4s, v0.4s, v1.4s\n" + "trn2 v21.4s, v0.4s, v1.4s\n" + "str q20, [%[pack_0]]\n" + "str q21, [%[pack_1]]\n" + : + : [pack_0] "r"(in_pack_0), [pack_1] "r"(in_pack_1), [in_0] "r"(in_0), + [in_1] "r"(in_1), [in_2] "r"(in_2), [in_3] 
"r"(in_3) + : "memory", "cc", "v0", "v1", "v20", "v21", "x2", "x3"); } } @@ -1126,7 +1127,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o0c0 = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 4 * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 4 * 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 4 * 8 * sizeof(OT)); continue; } F32 *fac = factor_v[idx]; @@ -1302,8 +1303,8 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *in_pack_0 = in_pack + idx * ic * 8 + c * 8; INT8 *in_pack_1 = in_pack_0 + 4; - memcpy(in_pack_0, in_0, 4 * bytesOf(DT_I8)); - memcpy(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_0, in_0, 4 * bytesOf(DT_I8)); + UNI_MEMCPY(in_pack_1, in_0 + 4, 4 * bytesOf(DT_I8)); } } @@ -1316,7 +1317,7 @@ EE convolution_winograd_A76(TensorDesc inputDesc, INT8 *f_o = filterArray + o * 8 * 36 * ic * 8 + idx * 8 * ic * 8; F16 *out_o0hw0 = otmArray + idx * 8; if (factor_v[idx][0] == 0) { - memset(out_o0hw0, 0, 8 * sizeof(OT)); + UNI_MEMSET(out_o0hw0, 0, 8 * sizeof(OT)); continue; } int32x4_t res[2] = {0}; diff --git a/compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_transform.h b/compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_transform.h similarity index 100% rename from compute/tensor/src/cpu/arm/int8/v8/convolution_winograd_transform.h rename to compute/tensor/src/cpu/arm/int8/v8.2/convolution_winograd_transform.h diff --git a/compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp b/compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp similarity index 99% rename from compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp rename to compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp index 77ec8489..a96d8cf3 100644 --- a/compute/tensor/src/cpu/arm/int8/v8/depthwise_pointwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/arm/int8/v8.2/depthwise_pointwise_convolution_direct.cpp @@ -45,10 +45,10 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -76,20 +76,20 @@ EE depthwise_pointwise_convolution_direct(TensorDesc inputDesc, INT8 *inArray_mov = inArray + n * ic * ihiw * 8; for (U32 c = 0; c < ic; c++) { if (paddingT > 0) { - memset(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingT * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingT * iw_pad * 8; } for (U32 h = paddingT; h < ih_pad - paddingB; h++) { - memset(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingL * 8 * bytesOf(idt)); inArray_pad_mov += paddingL * 8; - memcpy(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * 8 * bytesOf(idt)); inArray_pad_mov += iw * 8; inArray_mov += iw * 8; - memset(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingR * 8 * bytesOf(idt)); 
inArray_pad_mov += paddingR * 8; } if (paddingB > 0) { - memset(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); + UNI_MEMSET(inArray_pad_mov, 0, paddingB * iw_pad * 8 * bytesOf(idt)); inArray_pad_mov += paddingB * iw_pad * 8; } diff --git a/compute/tensor/src/cpu/arm/normalization.cpp b/compute/tensor/src/cpu/arm/normalization.cpp index a26d8bc3..33c01e5f 100644 --- a/compute/tensor/src/cpu/arm/normalization.cpp +++ b/compute/tensor/src/cpu/arm/normalization.cpp @@ -19,28 +19,31 @@ #include "cpu/arm/fp16/tensor_computing_fp16.h" #endif -EE layer_normalization_arm( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +EE layer_normalization_arm(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_fp32( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = layer_normalization_fp16( - inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + inputDesc, (F16 *)input, p, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/arm/padding.cpp b/compute/tensor/src/cpu/arm/padding.cpp index da462d8e..091a6355 100644 --- a/compute/tensor/src/cpu/arm/padding.cpp +++ b/compute/tensor/src/cpu/arm/padding.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
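These hunks, like the ones before them, replace raw memcpy/memset calls with the UNI_MEMCPY/UNI_MEMSET wrappers. Their definitions are not part of this diff; a plausible shape, stated only as an assumption, is a pair of macros that dispatch to the bounds-checked secure-C functions when those are compiled in and fall back to the plain libc calls otherwise:

#include <string.h>

// Hypothetical definitions for illustration; the real macros live in the uni
// headers and may differ in guard name and error handling.
#ifdef _USE_SECURE_C
#define UNI_MEMCPY(dst, src, n) memcpy_s((dst), (n), (src), (n))
#define UNI_MEMSET(dst, c, n) memset_s((dst), (n), (c), (n))
#else
#define UNI_MEMCPY(dst, src, n) memcpy((dst), (src), (n))
#define UNI_MEMSET(dst, c, n) memset((dst), (c), (n))
#endif

Routing every copy and fill through one wrapper keeps the secure/portable choice in a single place instead of scattering #ifdefs across the kernels.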
#include "cpu/arm/tensor_computing_arm.h" -#include EE padding_arm(TensorDesc inputDesc, const void *input, @@ -40,33 +39,33 @@ EE padding_arm(TensorDesc inputDesc, (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); U8 *outPtr = (U8 *)output + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + UNI_MEMSET(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -74,20 +73,20 @@ EE padding_arm(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -98,21 +97,21 @@ EE padding_arm(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 
index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + // UNI_MEMCPY(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/arm/pooling.cpp b/compute/tensor/src/cpu/arm/pooling.cpp index d0f0586f..39c1d25f 100644 --- a/compute/tensor/src/cpu/arm/pooling.cpp +++ b/compute/tensor/src/cpu/arm/pooling.cpp @@ -54,7 +54,7 @@ EE pooling_arm(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); it = ot = 1; - p.padding_before = p.padding_after = 0; + p.pad_before = p.pad_after = 0; p.kernel_t = p.stride_t = 1; } else if (tensorIs5d(inputDesc)) { CHECK_STATUS(tensor5dGet(inputDesc, &idt, &idf, &in, &ic, &it, &ih, &iw)); @@ -72,13 +72,11 @@ EE pooling_arm(TensorDesc inputDesc, if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_before >= p.kernel_t || p.padding_top >= p.kernel_h || - p.padding_left >= p.kernel_w) { + if (p.pad_before >= p.kernel_t || p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { return NOT_SUPPORTED; } ic /= 8; - int kernelSize = p.kernel_t * p.kernel_h * p.kernel_w; ArmPoolingFunction func = nullptr; if (p.mode == POOLING_MAX) { switch (idt) { @@ -124,29 +122,40 @@ EE pooling_arm(TensorDesc inputDesc, return NOT_SUPPORTED; } - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + int kernelSize = p.kernel_t * p.kernel_h * p.kernel_w; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + const U8 *src = (const U8 *)input + o * it * ih * iw * 8 * bytesOf(idt); + U8 *dst = (U8 *)output + o * ot * oh * ow * 8 * bytesOf(idt); for (U32 t = 0; t < ot; t++) { - int tstart = t * (int)p.stride_t - (int)p.padding_before; + int tstart = t * (int)p.stride_t - (int)p.pad_before; int tend = UNI_MIN(tstart + p.kernel_t, it); tstart = UNI_MAX(tstart, 0); for (U32 h = 0; h < oh; h++) { - int hstart = h * (int)p.stride_h - (int)p.padding_top; + int hstart = h * (int)p.stride_h - (int)p.pad_top; int hend = UNI_MIN(hstart + p.kernel_h, ih); hstart = UNI_MAX(hstart, 0); - for (U32 w = 0; w < ow; 
w++, outputPtr += 8 * bytesOf(odt)) { - int wstart = w * (int)p.stride_w - (int)p.padding_left; + for (U32 w = 0; w < ow; w++, dst += 8 * bytesOf(idt)) { + int wstart = w * (int)p.stride_w - (int)p.pad_left; int wend = UNI_MIN(wstart + p.kernel_w, iw); wstart = UNI_MAX(wstart, 0); - int poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + int poolSize = kernelSize; + if (!p.count_include_pad) { + poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + } ret = func(tstart, tend, hstart, hend, wstart, wend, kernelSize, poolSize, - inputPtr, it, ih, iw, outputPtr, scale); + src, it, ih, iw, dst, scale); } } } - inputPtr += it * ih * iw * 8 * bytesOf(idt); } } return ret; @@ -174,37 +183,47 @@ EE pooling_bp_arm( if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_top >= p.kernel_h || p.padding_left >= p.kernel_w) { + if (p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { ret = NOT_SUPPORTED; } ic /= 8; - const U8 *inputPtr = (const U8 *)input; - U8 *outputPtr = (U8 *)output; - for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < ih; h++) { - for (U32 w = 0; w < iw; w++, inputPtr += 8 * bytesOf(idt)) { - int hstart = (int)h * (int)p.stride_h - (int)p.padding_top; - int wstart = (int)w * (int)p.stride_w - (int)p.padding_left; - int hend = UNI_MIN(hstart + p.kernel_h, oh); - int wend = UNI_MIN(wstart + p.kernel_w, ow); - hstart = UNI_MAX(hstart, 0); - wstart = UNI_MAX(wstart, 0); - switch (idt) { + const U8 *src = (const U8 *)input; + U8 *dst = (U8 *)output; + int poolSize = p.kernel_t * p.kernel_h * p.kernel_w; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 o = 0; o < in * ic; o++) { + U32 n = o / ic; + U32 c = o % ic; + //for (U32 n = 0; n < in; n++) { + // for (U32 c = 0; c < ic; c++) { + const U8 *src = (const U8 *)input + o * ih * iw * 8 * bytesOf(idt); + U8 *dst = (U8 *)output + o * oh * ow * 8 * bytesOf(idt); + for (U32 h = 0; h < ih; h++) { + for (U32 w = 0; w < iw; w++, src += 8 * bytesOf(idt)) { + int hstart = (int)h * (int)p.stride_h - (int)p.pad_top; + int wstart = (int)w * (int)p.stride_w - (int)p.pad_left; + int hend = UNI_MIN(hstart + p.kernel_h, oh); + int wend = UNI_MIN(wstart + p.kernel_w, ow); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } + switch (idt) { #ifdef _USE_FP32 - case DT_F32: - ret = pooling_bp_c8_fp32((const F32 *)inputPtr, hstart, hend, wstart, - wend, (F32 *)outputPtr, ow, p); - break; + case DT_F32: + ret = pooling_bp_c8_fp32((const F32 *)src, hstart, hend, wstart, wend, + poolSize, (F32 *)dst, ow, p); + break; #endif - default: - ret = NOT_SUPPORTED; - break; - } + default: + ret = NOT_SUPPORTED; + break; } } - outputPtr += oh * ow * 8 * bytesOf(odt); } } return ret; diff --git a/compute/tensor/src/cpu/arm/scale.cpp b/compute/tensor/src/cpu/arm/scale.cpp index 405ae42c..4e44f033 100644 --- a/compute/tensor/src/cpu/arm/scale.cpp +++ b/compute/tensor/src/cpu/arm/scale.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
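In the pooling hunks above, the forward and backward loops are flattened over o = n * ic + c so that a single #pragma omp parallel for covers the whole batch-times-channel-block space, with each thread deriving its own src/dst pointers from o (n = o / ic, c = o % ic) instead of advancing shared running pointers. The other behavioural change is the average-pool divisor, which now honours count_include_pad. A minimal sketch of that divisor rule, reusing the names from the diff but not taken from the library, is:

// Divisor for average pooling: either the full kernel volume (padded positions
// count toward the average) or only the in-bounds part of the window after
// clamping by tstart/tend, hstart/hend, wstart/wend.
static inline int pool_divisor(bool count_include_pad, int kernel_t, int kernel_h,
    int kernel_w, int tstart, int tend, int hstart, int hend, int wstart, int wend)
{
    if (count_include_pad) {
        return kernel_t * kernel_h * kernel_w;
    }
    return (tend - tstart) * (hend - hstart) * (wend - wstart);
}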
#include "cpu/arm/tensor_computing_arm.h" +#include "cpu/arm/int32/tensor_computing_int32.h" #ifdef _USE_FP32 #include "cpu/arm/fp32/tensor_computing_fp32.h" #endif @@ -36,7 +37,7 @@ EE scale_arm(TensorDesc inputDesc, if (outputDesc.df == DF_NCHWC8) { axis = outputDesc.nDims; } - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (outputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { @@ -52,8 +53,12 @@ EE scale_arm(TensorDesc inputDesc, break; } #endif + case DT_I32: { + ret = scale_int32((I32 *)input, axis, outputDesc.nDims, (I32 *)alpha, (I32 *)beta, on, + oc, elements_per_channel, ic, (I32 *)output); + break; + } default: - ret = NOT_SUPPORTED; break; } diff --git a/compute/tensor/src/cpu/arm/softmax.cpp b/compute/tensor/src/cpu/arm/softmax.cpp index 88ebb474..df50e0d6 100644 --- a/compute/tensor/src/cpu/arm/softmax.cpp +++ b/compute/tensor/src/cpu/arm/softmax.cpp @@ -22,9 +22,8 @@ EE softmax_arm( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); @@ -38,9 +37,30 @@ EE softmax_arm( } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = logsoftmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: { + ret = logsoftmax_fp16(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/arm/tensor_computing_arm.h b/compute/tensor/src/cpu/arm/tensor_computing_arm.h index 9aa6ab5c..1329c7b7 100644 --- a/compute/tensor/src/cpu/arm/tensor_computing_arm.h +++ b/compute/tensor/src/cpu/arm/tensor_computing_arm.h @@ -179,8 +179,13 @@ EE rnncell_arm(TensorDesc xDesc, void *currentH, Arch arch); -EE layer_normalization_arm( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_arm(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output); EE pooling_arm(TensorDesc inputDesc, const void *input, @@ -208,6 +213,9 @@ EE scale_arm(TensorDesc inputDesc, EE softmax_arm( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_arm( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE check_arm(TensorDesc inputDescA, const void *inputA, TensorDesc inputDescB, diff --git a/compute/tensor/src/cpu/arm/transform_functions.h b/compute/tensor/src/cpu/arm/transform_functions.h index 8b191890..3fc9b27a 100644 --- a/compute/tensor/src/cpu/arm/transform_functions.h +++ b/compute/tensor/src/cpu/arm/transform_functions.h @@ -163,12 +163,12 @@ template inline T *convolution_input_padding_per_channel( U32 n, U32 ic, U32 it, U32 ih, U32 iw, const ConvolutionParamSpec &p, T *src, T *dst) { - U32 it_pad = it + p.padding_before + p.padding_after; - U32 ih_pad = ih + p.padding_top + p.padding_bottom; - U32 iw_pad = iw + p.padding_left + p.padding_right; + U32 it_pad = it + p.pad_before + p.pad_after; + U32 
ih_pad = ih + p.pad_top + p.pad_bottom; + U32 iw_pad = iw + p.pad_left + p.pad_right; T *inArray_pad; - if (p.padding_before == 0 && p.padding_after == 0 && p.padding_top == 0 && - p.padding_bottom == 0 && p.padding_left == 0 && p.padding_right == 0) { + if (p.pad_before == 0 && p.pad_after == 0 && p.pad_top == 0 && p.pad_bottom == 0 && + p.pad_left == 0 && p.pad_right == 0) { T *inArray_mov = src + n * ic * it * ih * iw * CAlignSize; inArray_pad = inArray_mov; } else { @@ -179,25 +179,25 @@ inline T *convolution_input_padding_per_channel( for (U32 c = 0; c < ic; c++) { T *inArray_mov = src + (n * ic + c) * it * ih * iw * CAlignSize; T *inArray_pad_mov = inArray_pad + c * it_pad * ih_pad * iw_pad * CAlignSize; - memset(inArray_pad_mov, 0, p.padding_before * ih_pad * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_before * ih_pad * iw_pad * CAlignSize; - for (U32 t = p.padding_before; t < it_pad - p.padding_after; t++) { - memset(inArray_pad_mov, 0, p.padding_top * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_top * iw_pad * CAlignSize; - for (U32 h = p.padding_top; h < ih_pad - p.padding_bottom; h++) { - memset(inArray_pad_mov, 0, p.padding_left * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_left * CAlignSize; - memcpy(inArray_pad_mov, inArray_mov, iw * CAlignSize * sizeof(T)); + UNI_MEMSET(inArray_pad_mov, 0, p.pad_before * ih_pad * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_before * ih_pad * iw_pad * CAlignSize; + for (U32 t = p.pad_before; t < it_pad - p.pad_after; t++) { + UNI_MEMSET(inArray_pad_mov, 0, p.pad_top * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_top * iw_pad * CAlignSize; + for (U32 h = p.pad_top; h < ih_pad - p.pad_bottom; h++) { + UNI_MEMSET(inArray_pad_mov, 0, p.pad_left * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_left * CAlignSize; + UNI_MEMCPY(inArray_pad_mov, inArray_mov, iw * CAlignSize * sizeof(T)); inArray_pad_mov += iw * CAlignSize; inArray_mov += iw * CAlignSize; - memset(inArray_pad_mov, 0, p.padding_right * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_right * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_right * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_right * CAlignSize; } - memset(inArray_pad_mov, 0, p.padding_bottom * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_bottom * iw_pad * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_bottom * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_bottom * iw_pad * CAlignSize; } - memset(inArray_pad_mov, 0, p.padding_after * ih_pad * iw_pad * CAlignSize * sizeof(T)); - inArray_pad_mov += p.padding_after * ih_pad * iw_pad * CAlignSize; + UNI_MEMSET(inArray_pad_mov, 0, p.pad_after * ih_pad * iw_pad * CAlignSize * sizeof(T)); + inArray_pad_mov += p.pad_after * ih_pad * iw_pad * CAlignSize; } } return inArray_pad; diff --git a/compute/tensor/src/cpu/cast.cpp b/compute/tensor/src/cpu/cast.cpp new file mode 100644 index 00000000..5ff4bcab --- /dev/null +++ b/compute/tensor/src/cpu/cast.cpp @@ -0,0 +1,100 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static void cast_kernel(U32 len, TI *input, TO *output) +{ + for (U32 i = 0; i < len; ++i) { + output[i] = (TO)(input[i]); + } +} + +template +static EE cast_kernel(U32 len, DataType odt, T *input, void *output) +{ + EE ret = SUCCESS; + switch (odt) { + case DT_I32: { + cast_kernel(len, input, (I32 *)output); + break; + } + case DT_U32: { + cast_kernel(len, input, (U32 *)output); + break; + } + case DT_F32: { + cast_kernel(len, input, (F32 *)output); + break; + } +#ifdef _USE_FP16 + case DT_F16: { + cast_kernel(len, input, (F16 *)output); + break; + } +#endif + case DT_U8: { + cast_kernel(len, input, (U8 *)output); + break; + } + case DT_I8: { + cast_kernel(len, input, (INT8 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE cast_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + DataType odt = outputDesc.dt; + U32 len = tensorNumElements(inputDesc); + EE ret; + switch (idt) { + case DT_F32: { + ret = cast_kernel(len, odt, (F32 *)input, output); + break; + } +#ifdef _USE_FP16 + case DT_F16: { + ret = cast_kernel(len, odt, (F16 *)input, output); + break; + } +#endif + case DT_U32: { + ret = cast_kernel(len, odt, (U32 *)input, output); + break; + } + case DT_I32: { + ret = cast_kernel(len, odt, (I32 *)input, output); + break; + } + case DT_U8: { + ret = cast_kernel(len, odt, (U8 *)input, output); + break; + } + case DT_I8: { + ret = cast_kernel(len, odt, (INT8 *)input, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/check.cpp b/compute/tensor/src/cpu/check.cpp new file mode 100644 index 00000000..434a85a3 --- /dev/null +++ b/compute/tensor/src/cpu/check.cpp @@ -0,0 +1,135 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE check_kernel( + TensorDesc aDesc, TA *a, TensorDesc bDesc, TB *b, CheckParamSpec p, TensorDesc outDesc, U8 *out) +{ + int aLen = tensorNumElements(aDesc); + int bLen = tensorNumElements(bDesc); + int len = tensorNumElements(outDesc); + EE ret = SUCCESS; + for (int i = 0; i < len; i++) { + TA va = a[i % aLen]; + TB vb = b[i % bLen]; + switch (p.mode) { + case CHECK_GREATER: { + out[i] = (va > (TA)vb) ? 1 : 0; + break; + } + case CHECK_GREATER_EQUAL: { + out[i] = (va >= (TA)vb) ? 1 : 0; + break; + } + case CHECK_EQUAL: { + out[i] = (va == (TA)vb) ? 1 : 0; + break; + } + case CHECK_NOT_EQUAL: { + out[i] = (va != (TA)vb) ? 1 : 0; + break; + } + case CHECK_LESS: { + out[i] = (va < (TA)vb) ? 1 : 0; + break; + } + case CHECK_LESS_EQUAL: { + out[i] = (va <= (TA)vb) ? 1 : 0; + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + } + return ret; +} + +template +EE check_wrapper(TensorDesc inputDescA, + TA *inputA, + TensorDesc inputDescB, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + U8 *output) +{ + EE ret = SUCCESS; + switch (inputDescB.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (F32 *)inputB, p, outputDesc, output); + break; + } +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = check_kernel( + inputDescA, inputA, inputDescB, (F16 *)inputB, p, outputDesc, output); + break; +#endif + case DT_U32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (U32 *)inputB, p, outputDesc, output); + break; + } + case DT_I32: { + ret = check_kernel( + inputDescA, inputA, inputDescB, (I32 *)inputB, p, outputDesc, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + +EE check_cpu(TensorDesc inputADesc, + void *inputA, + TensorDesc inputBDesc, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputADesc.dt) { + case DT_U32: + ret = check_wrapper( + inputADesc, (U32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; + case DT_I32: + ret = check_wrapper( + inputADesc, (I32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#ifdef _USE_FP32 + case DT_F32: + ret = check_wrapper( + inputADesc, (F32 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = check_wrapper( + inputADesc, (F16 *)inputA, inputBDesc, inputB, p, outputDesc, (U8 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/concat.cpp b/compute/tensor/src/cpu/concat.cpp index 5927cdf3..b21f6461 100644 --- a/compute/tensor/src/cpu/concat.cpp +++ b/compute/tensor/src/cpu/concat.cpp @@ -41,7 +41,7 @@ inline static void concat_v1(const std::vector &inputDesc, U32 blockSize = inputDesc[j].dims[axis] * tileSize; if (!jumpMemcpy[j]) { U8 *srcPtr = ((U8 *)input[j]) + i * blockSize; - memcpy(dstPtr, srcPtr, blockSize); + UNI_MEMCPY(dstPtr, srcPtr, blockSize); } dstPtr += blockSize; } @@ -61,21 
+61,23 @@ inline static void concat_v2(const std::vector &inputDesc, U8 *dstPtr = (U8 *)output; for (U32 i = 0; i < loops; i++) { for (U32 j = 0; j < num; j++) { - U32 blockSize = inputDesc[j].dims[axis] * tileSize; + I32 blockSize = inputDesc[j].dims[axis] * tileSize; if (!jumpMemcpy[j]) { U8 *srcPtr = ((U8 *)input[j]) + i * blockSize; #ifdef _USE_OPENMP - U32 bblockNum = OMP_NUM_THREADS; - U32 bblockSize = (blockSize + bblockNum - 1) / bblockNum; + I32 bblockNum = OMP_NUM_THREADS; + I32 bblockSize = (blockSize + bblockNum - 1) / bblockNum; + bblockSize = UNI_MIN(32, bblockSize); + bblockNum = (blockSize + bblockSize - 1) / bblockSize; -#pragma omp parallel for num_threads(OMP_NUM_THREADS) - for (U32 k = 0; k < bblockNum; ++k) { - U32 copyDst = k * bblockSize; - memcpy(dstPtr + copyDst, srcPtr + copyDst, +#pragma omp parallel for num_threads(OMP_NUM_THREADS) if (bblockNum >= OMP_NUM_THREADS) + for (I32 k = 0; k < bblockNum; ++k) { + I32 copyDst = k * bblockSize; + UNI_MEMCPY(dstPtr + copyDst, srcPtr + copyDst, UNI_MIN(bblockSize, blockSize - copyDst)); } #else - memcpy(dstPtr, srcPtr, blockSize); + UNI_MEMCPY(dstPtr, srcPtr, blockSize); #endif } dstPtr += blockSize; @@ -138,7 +140,9 @@ static EE concat(std::vector inputDesc, U8 *tmpPtr = (U8 *)tmp; U32 outputOff = 0; for (U32 j = 0; j < num; j++) { - if ((4 != inputDesc[j].nDims) || (1 != inputDesc[j].dims[1]) || (1 != inputDesc[j].dims[0])) { + if (((4 == inputDesc[j].nDims) && + ((1 != inputDesc[j].dims[1]) || (1 != inputDesc[j].dims[0]))) || + ((3 == inputDesc[j].nDims) && (1 != inputDesc[j].dims[0]))) { if (isC8 && (DF_NCHWC8 != inputDesc[j].df)) { TensorDesc tmpDesc = inputDesc[j]; tmpDesc.df = DF_NCHWC8; diff --git a/compute/tensor/src/cpu/cpu_functions_template.h b/compute/tensor/src/cpu/cpu_functions_template.h index b30c4471..67f3c7c9 100644 --- a/compute/tensor/src/cpu/cpu_functions_template.h +++ b/compute/tensor/src/cpu/cpu_functions_template.h @@ -63,9 +63,9 @@ inline void array_power_template(T *input, T *output, I32 len, F32 power) } template -EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) +inline EE activation_template(const ActivationParamSpec &activationDesc, const F32 &input, T *output) { - F32 value, result = 0; + F32 result = 0; EE ret = SUCCESS; switch (activationDesc.mode) { case ACTIVATION_NULL: { @@ -73,86 +73,52 @@ EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) break; } case ACTIVATION_RELU: { - value = input; - F32 tmp = activationDesc.value[0] * value; - if (value < tmp) { - value = tmp; - } - result = value; + result = UNI_MAX(activationDesc.value[0] * input, input); break; } case ACTIVATION_RELU6: { - value = input; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = value; + result = UNI_MIN(UNI_MAX(input, 0), 6); break; } case ACTIVATION_H_SIGMOID: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = value / 6; + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) / 6; break; } case ACTIVATION_H_SWISH: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = input * (value / 6); + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) * input / 6; break; } case ACTIVATION_H_SWISH_NODIV: { - value = input + 3; - if (value < 0) { - value = 0; - } - if (value > 6) { - value = 6; - } - result = input * value; + result = UNI_MIN(UNI_MAX(input + 3, 0), 6) * input; break; } case ACTIVATION_GELU: { - value = input; - value = erf(value / 
sqrt(2)); + F32 value = erf(input / sqrt(2)); value = 0.5 * (1.0 + value); - value = input * value; - result = value; + result = input * value; break; } case ACTIVATION_TANH: { - value = 1.0 - 2.0 / (exp(2.0 * input) + 1.0); - result = value; + result = 1.0 - 2.0 / (exp(2.0 * input) + 1.0); break; } case ACTIVATION_SIGMOID: { - value = 1.0 / (1.0 + exp(-1.0 * input)); - result = value; + result = 1.0 / (1.0 + exp(-1.0 * input)); + break; + } + case ACTIVATION_SWISH: { + result = input / (1.0 + exp(-1.0 * input)); break; } case ACTIVATION_MISH: { - value = input; + F32 value = input; F32 mish_threshold = 20; if (value < -mish_threshold) { value = exp(value); } else if (!(value > mish_threshold || value < -mish_threshold)) { value = log(exp(value) + 1.0); } - value = input * tanh(value); - result = value; + result = input * tanh(value); break; } case ACTIVATION_SOFTPLUS: { @@ -183,6 +149,22 @@ EE activation_template(ActivationParamSpec activationDesc, F32 input, T *output) result = -input; break; } + case ACTIVATION_ROUND: { + result = round(input); + break; + } + case ACTIVATION_CEIL: { + result = ceil(input); + break; + } + case ACTIVATION_FLOOR: { + result = floor(input); + break; + } + case ACTIVATION_RECIPROCAL: { + result = 1 / input; + break; + } default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/deconvolution.cpp b/compute/tensor/src/cpu/deconvolution.cpp index cfee9b10..60e33f1a 100644 --- a/compute/tensor/src/cpu/deconvolution.cpp +++ b/compute/tensor/src/cpu/deconvolution.cpp @@ -50,7 +50,7 @@ EE deconvolution_infer_forward_algorithm_cpu(TensorDesc inputDesc, } #ifdef _USE_X86 - if (IS_X86(arch) && idf == DF_NCHWC8 && (fc * 2 < ic || fc < 128)) { + if (IS_X86(arch) && idf == DF_NCHWC8) { *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; return SUCCESS; } @@ -144,23 +144,21 @@ EE deconvolution_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, if (algorithm == CONVOLUTION_ALGORITHM_IM2COL_GEMM) { TensorDesc matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); TensorDesc matrixBDesc = tensor2df(idt, DF_NORMAL, ic, oc * fh * fw); - CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, X86_AVX2)); + CHECK_STATUS(matrix_matrix_multiply_tmp_bytes(matrixADesc, matrixBDesc, bytes, arch)); *bytes += in * ih * iw * oc * fh * fw * bytesOf(idt); -#ifdef _USE_NEON - if (IS_ARM(arch) && idf == DF_NCHWC8) { + if (!IS_X86(arch) || idf != DF_NCHWC8 || in > 1) { *bytes += in * ih * iw * ic * bytesOf(idt); } *bytes += 32; -#endif return SUCCESS; } U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 tPadding = fh - 1 - paddingT; U32 bPadding = fh - 1 - paddingB; @@ -197,37 +195,37 @@ EE deconvolution_gemm(TensorDesc inputDesc, U32 fh = convParamSpec.kernel_h; U32 fw = convParamSpec.kernel_w; - TensorDesc matrixADesc = tensor2df(idt, DF_TRANSPOSE, ic, in * ih * iw); - if (idf == DF_NCHWC8) { - if (IS_X86(arch)) { - matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); - } else { - TensorDesc tmpDesc = tensor4df(odt, DF_NCHW, in, ic, ih, iw); - U8 *tmpInput = (U8 *)tmp; - transformToNCHW(inputDesc, input, tmpDesc, tmpInput); - input = tmpInput; - tmp = (void *)(tmpInput + in * ic * iw 
* ih * bytesOf(idt)); - } + TensorDesc matrixADesc = tensor2df(idt, DF_NORMAL, in * ih * iw, ic); + if (IS_X86(arch) && idf == DF_NCHWC8 && in == 1) { + matrixADesc = tensor2df(idt, DF_NKN8, ic, in * ih * iw); + } else { + TensorDesc tmpDesc = tensor4df(odt, DF_NHWC, in, ic, ih, iw); + U8 *tmpInput = (U8 *)tmp; + transformFormat(inputDesc, input, tmpDesc, tmpInput); + input = tmpInput; + tmp = (void *)(tmpInput + in * ic * iw * ih * bytesOf(idt)); } TensorDesc matrixCDesc = tensor2df(odt, DF_NORMAL, in * ih * iw, fw * fh * oc); U8 *tmpOutput = (U8 *)tmp; - tmpOutput += in * ih * iw * ic * bytesOf(idt); + tmp = (void *)(tmpOutput + in * ih * iw * fw * fh * oc * bytesOf(idt)); - memset(tmpOutput, 0, in * ih * iw * fw * fh * oc * bytesOf(idt)); + UNI_MEMSET(tmpOutput, 0, in * ih * iw * fw * fh * oc * bytesOf(idt)); CHECK_STATUS(matrix_matrix_multiply(matrixADesc, input, filterDesc, filter, tmpBytes, tmp, matrixCDesc, tmpOutput, nullptr, arch)); U8 *tmpOutputPtr = (U8 *)output; U32 biasTileSize = bytesOf(biasDesc.dt) * 8; - U8 *biasPtr = (U8 *)bias; - for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { - for (U32 n = 0; n < oh * ow; n++) { - memcpy(tmpOutputPtr, biasPtr, biasTileSize); - tmpOutputPtr += biasTileSize; + for (U32 n = 0; n < on; ++n) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 hw = 0; hw < oh * ow; hw++) { + UNI_MEMCPY(tmpOutputPtr, biasPtr, biasTileSize); + tmpOutputPtr += biasTileSize; + } } } - EE ret = NOT_SUPPORTED; + EE ret = SUCCESS; if (IS_ARM(arch)) { #ifdef _USE_NEON ret = @@ -299,18 +297,18 @@ EE deconvolution_cpu(TensorDesc inputDesc, U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; ConvolutionParamSpec transposedCD = convParamSpec; transposedCD.stride_h = 1; transposedCD.stride_w = 1; - transposedCD.padding_top = 0; - transposedCD.padding_bottom = 0; - transposedCD.padding_left = 0; - transposedCD.padding_right = 0; + transposedCD.pad_top = 0; + transposedCD.pad_bottom = 0; + transposedCD.pad_left = 0; + transposedCD.pad_right = 0; transposedCD.dilatedRate_h = 1; transposedCD.dilatedRate_w = 1; @@ -323,69 +321,73 @@ EE deconvolution_cpu(TensorDesc inputDesc, U32 stuffW = strideW - 1; U32 ihPadded = ih + (ih - 1) * stuffH + tPadding + bPadding; U32 iwPadded = iw + (iw - 1) * stuffW + lPadding + rPadding; - TensorDesc inPaddedDesc = tensor4df(idt, idf, in, ic, ihPadded, iwPadded); + TensorDesc inPaddedDesc = tensor4df(idt, idf, 1, ic, ihPadded, iwPadded); + TensorDesc singleOutputDesc = tensor4df(idt, idf, 1, oc, oh, ow); - U8 *inPad = (U8 *)tmp; - U8 *inPadMov = inPad; - U8 *inputMov = (U8 *)input; U32 memUnit = 8 * bytesOf(idt); + U32 ic8 = ic / 8; + EE ret = NOT_SUPPORTED; + TensorDesc blankTensorDesc; + ActivationParamSpec blankActivationParamSpec; - ic /= 8; + for (U32 n = 0; n < in; ++n) { + U8 *inputMov = (U8 *)input + n * ih * iw * ic * bytesOf(idt); + U8 *outputMov = (U8 *)output + n * oh * ow * oc * bytesOf(odt); + U8 *inPad = (U8 *)tmp; + U8 *inPadMov = inPad; - for (U32 c = 0; c < ic; c++) { - for (U32 h = 0; h < tPadding; h++) { - memset(inPadMov, 0, iwPadded * memUnit); - inPadMov += iwPadded * memUnit; - } - for (U32 h = 
0; h < ih - 1; h++) { - memset(inPadMov, 0, lPadding * memUnit); + for (U32 c = 0; c < ic8; c++) { + for (U32 h = 0; h < tPadding; h++) { + UNI_MEMSET(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } + for (U32 h = 0; h < ih - 1; h++) { + UNI_MEMSET(inPadMov, 0, lPadding * memUnit); + inPadMov += lPadding * memUnit; + for (U32 w = 0; w < iw - 1; w++) { + UNI_MEMCPY(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + UNI_MEMSET(inPadMov, 0, stuffW * memUnit); + inPadMov += stuffW * memUnit; + } + UNI_MEMCPY(inPadMov, inputMov, memUnit); + inPadMov += memUnit; + inputMov += memUnit; + UNI_MEMSET(inPadMov, 0, rPadding * memUnit); + inPadMov += rPadding * memUnit; + + // stuffH + UNI_MEMSET(inPadMov, 0, iwPadded * stuffH * memUnit); + inPadMov += iwPadded * stuffH * memUnit; + } + UNI_MEMSET(inPadMov, 0, lPadding * memUnit); inPadMov += lPadding * memUnit; for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, memUnit); + UNI_MEMCPY(inPadMov, inputMov, memUnit); inPadMov += memUnit; inputMov += memUnit; - memset(inPadMov, 0, stuffW * memUnit); + UNI_MEMSET(inPadMov, 0, stuffW * memUnit); inPadMov += stuffW * memUnit; } - memcpy(inPadMov, inputMov, memUnit); + UNI_MEMCPY(inPadMov, inputMov, memUnit); inPadMov += memUnit; inputMov += memUnit; - memset(inPadMov, 0, rPadding * memUnit); + UNI_MEMSET(inPadMov, 0, rPadding * memUnit); inPadMov += rPadding * memUnit; - // stuffH - memset(inPadMov, 0, iwPadded * stuffH * memUnit); - inPadMov += iwPadded * stuffH * memUnit; - } - memset(inPadMov, 0, lPadding * memUnit); - inPadMov += lPadding * memUnit; - for (U32 w = 0; w < iw - 1; w++) { - memcpy(inPadMov, inputMov, memUnit); - inPadMov += memUnit; - inputMov += memUnit; - memset(inPadMov, 0, stuffW * memUnit); - inPadMov += stuffW * memUnit; - } - memcpy(inPadMov, inputMov, memUnit); - inPadMov += memUnit; - inputMov += memUnit; - memset(inPadMov, 0, rPadding * memUnit); - inPadMov += rPadding * memUnit; - - for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { - memset(inPadMov, 0, iwPadded * memUnit); - inPadMov += iwPadded * memUnit; + for (U32 h = ihPadded - bPadding; h < ihPadded; h++) { + UNI_MEMSET(inPadMov, 0, iwPadded * memUnit); + inPadMov += iwPadded * memUnit; + } } - } - EE ret = NOT_SUPPORTED; - TensorDesc blankTensorDesc; - ActivationParamSpec blankActivationParamSpec; - ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, - blankTensorDesc, nullptr, transposedCD, DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, - biasDesc, bias, blankTensorDesc, nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), - inPad + tensorNumBytes(inPaddedDesc), outputDesc, output, activationDesc, - blankActivationParamSpec, arch); + ret = depthwise_pointwise_convolution_cpu(inPaddedDesc, inPad, filterDesc, filter, + blankTensorDesc, nullptr, transposedCD, + DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT, biasDesc, bias, blankTensorDesc, + nullptr, tmpBytes - tensorNumBytes(inPaddedDesc), inPad + tensorNumBytes(inPaddedDesc), + singleOutputDesc, outputMov, activationDesc, blankActivationParamSpec, arch); + } return ret; } diff --git a/compute/tensor/src/cpu/depth2space.cpp b/compute/tensor/src/cpu/depth2space.cpp new file mode 100644 index 00000000..7ca67dfd --- /dev/null +++ b/compute/tensor/src/cpu/depth2space.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE depth2space_kernel( + TensorDesc inputDesc, T *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + int bh = p.block_size; + int bw = p.block_size; + if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); + iw = ow = 1; + bw = 1; + } else { + return NOT_SUPPORTED; + } + + int cx = 1; + if (idf == DF_NCHWC8) { + cx = 8; + } + if (idf == DF_NCHWC16) { + cx = 16; + } + U32 icx = ic / cx; + for (U32 n = 0, o_i = 0; n < in; n++) { + for (U32 c = 0; c < oc; c++) { + for (U32 h = 0; h < ih; h++) { + for (int i = 0; i < bh; i++) { + for (U32 w = 0; w < iw; w++) { + for (int j = 0; j < bw; j++, o_i++) { + int i_c = (c * bh + i) * bw + j; + int c1 = i_c / cx; + int c2 = i_c % cx; + int i_i = (((n * icx + c1) * ih + h) * iw + w) * cx + c2; + output[o_i] = input[i_i]; + } + } + } + } + } + } + return SUCCESS; +} + +EE depth2space_cpu( + TensorDesc inputDesc, void *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = depth2space_kernel(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = depth2space_kernel(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp index b7d70f00..caf10831 100644 --- a/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/depthwise_pointwise_convolution.cpp @@ -53,7 +53,7 @@ EE depthwise_pointwise_convolution_cpu(TensorDesc inputDesc, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_pointwise_convolution_x86(inputDesc, input, nullptr, dwFilterDesc, dwFilter, - pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, pwBiasDesc, + pwFilterDesc, pwFilter, convParamSpec, algorithm, 
nullptr, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); #endif diff --git a/compute/tensor/src/cpu/detectionoutput.cpp b/compute/tensor/src/cpu/detectionoutput.cpp index 9695c638..aa2eebc1 100644 --- a/compute/tensor/src/cpu/detectionoutput.cpp +++ b/compute/tensor/src/cpu/detectionoutput.cpp @@ -11,86 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "error.h" #include "cpu/tensor_computing_cpu.h" - -inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} +#include "cpu/non_max_suppression.h" template EE detectionoutput_kernel(std::vector input, @@ -138,71 +60,54 @@ EE detectionoutput_kernel(std::vector input, boxes[i].assign(box.begin(), box.end()); } - std::vector> allclass_boxrects; - std::vector> allclass_boxscores; - allclass_boxrects.resize(numclass); - allclass_boxscores.resize(numclass); - + std::vector> allclass_boxrects(numclass); for (U32 i = 1; i < numclass; i++) { std::vector class_boxrects; - std::vector class_boxscores; for (U32 j = 0; j < num_total_priorbox; j++) { F32 score = confidence[j * numclass + i]; if (score > confidence_threshold) { std::vector inbox; inbox.assign(boxes[j].begin(), boxes[j].end()); - BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; + BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i, score, j}; class_boxrects.push_back(b); - class_boxscores.push_back(score); } } // sort the boxes with scores - qsort_descent( - class_boxrects, class_boxscores, 0, static_cast(class_boxscores.size() - 1)); + std::stable_sort(class_boxrects.begin(), class_boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); - if (nms_top_k < (U32)class_boxrects.size()) { + if (nms_top_k < 
class_boxrects.size()) { class_boxrects.resize(nms_top_k); - class_boxscores.resize(nms_top_k); } // apply nms - std::vector picked; - nms_pickedboxes(class_boxrects, picked, nms_threshold); - - for (I64 j = 0; j < (I64)picked.size(); j++) { + std::vector picked = nms_pickedboxes(class_boxrects, nms_threshold); + for (U32 j = 0; j < picked.size(); j++) { I64 picked_box = picked[j]; allclass_boxrects[i].push_back(class_boxrects[picked_box]); - allclass_boxscores[i].push_back(class_boxscores[picked_box]); } } std::vector boxrects; - std::vector boxscores; - for (U32 i = 1; i < numclass; i++) { boxrects.insert(boxrects.end(), allclass_boxrects[i].begin(), allclass_boxrects[i].end()); - boxscores.insert( - boxscores.end(), allclass_boxscores[i].begin(), allclass_boxscores[i].end()); } - qsort_descent(boxrects, boxscores, 0, static_cast(boxscores.size() - 1)); - + std::stable_sort(boxrects.begin(), boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); if (keep_top_k < (U32)boxrects.size()) { boxrects.resize(keep_top_k); - boxscores.resize(keep_top_k); } - U32 num_detected = static_cast(boxrects.size()); + U32 num_detected = boxrects.size(); // the first box contains the number of availble boxes in the first element. output[0] = num_detected; output[1] = output[2] = output[3] = output[4] = output[5] = 0; for (U32 i = 0; i < num_detected; i++) { BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - output[(i + 1) * 6] = b.label; - output[(i + 1) * 6 + 1] = score; + output[(i + 1) * 6 + 1] = b.score; output[(i + 1) * 6 + 2] = b.xmin; output[(i + 1) * 6 + 3] = b.ymin; output[(i + 1) * 6 + 4] = b.xmax; diff --git a/compute/tensor/src/cpu/eltwise.cpp b/compute/tensor/src/cpu/eltwise.cpp index fd3bd34f..f369d0eb 100644 --- a/compute/tensor/src/cpu/eltwise.cpp +++ b/compute/tensor/src/cpu/eltwise.cpp @@ -34,24 +34,90 @@ static std::vector calculateRelativeLocalIndex_cpu(U32 *indexes, U32 *dims, return relativeIndexes; } -// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] -// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] -// [1, 20, 10] + [10] = [1. 
20, 10] + [1, 1, 10] = [1, 20, 10] -EE eltwise_cpu(std::vector inputDesc, - std::vector input_, - EltwiseParamSpec eltwiseDesc, - U32 tmpBytes, - void *tmp, - TensorDesc outputDesc, - void *output, - Arch arch) +static void get_dim_nonone_bound(TensorDesc desc, int *left, int *right) { - U32 num = inputDesc.size(); - if (num <= 1 || outputDesc.nDims < 1) { - return NOT_MATCH; + *left = -1; + for (U32 i = 0; i < desc.nDims; i++) { + if (desc.dims[i] == 1) { + *left = i; + } else { + break; + } + } + *right = desc.nDims; + for (I32 i = desc.nDims - 1; i >= 0; i--) { + if (desc.dims[i] == 1) { + *right = i; + } else { + break; + } } - std::vector input = input_; + *left = *left + 1; + *right = *right - 1; +} +static int scale_axis( + std::vector inputDesc, TensorDesc outputDesc, int *scaleId, TensorDesc *scaleDesc) +{ + if (inputDesc.size() != 2) { + return -1; + } + int al, ar, bl, br; + get_dim_nonone_bound(inputDesc[0], &al, &ar); + get_dim_nonone_bound(inputDesc[1], &bl, &br); + // use power operator + if (al > ar) { + return -2; + } + if (bl > br) { + return -3; + } + int cl = UNI_MIN(al, bl); + int cr = UNI_MAX(ar, br); + int alpha = -1; + if (cr - cl > ar - al) { + alpha = 0; + } + if (cr - cl > br - bl) { + alpha = 1; + } + if (alpha < 0) { + return -1; + } + int dl = UNI_MAX(al, bl); + int dr = UNI_MIN(ar, br); + for (int i = dl; i <= dr; i++) { + if (inputDesc[0].dims[i] != inputDesc[1].dims[i]) { + return -1; + } + } + int axis = cr - dr; + *scaleId = 1 - alpha; + *scaleDesc = inputDesc[*scaleId]; + scaleDesc->nDims = (dl - cl) + (cr - dr) + 1; + int j = 0; + for (int i = cl; i < dl; i++) { + scaleDesc->dims[j++] = inputDesc[*scaleId].dims[i]; + } + scaleDesc->dims[j] = 1; + for (int i = dl; i <= dr; i++) { + scaleDesc->dims[j] *= inputDesc[*scaleId].dims[i]; + } + for (int i = dr + 1; i <= cr; i++) { + scaleDesc->dims[++j] = inputDesc[*scaleId].dims[i]; + } + if (dr == cr) { + scaleDesc->dims[++j] = 1; + scaleDesc->nDims++; + axis++; + } + return axis; +} + +static void align_param( + std::vector &inputDesc, std::vector &input, void *tmp, TensorDesc &outputDesc) +{ + U32 num = inputDesc.size(); U8 *ptr = (U8 *)tmp; std::set nchw = {DF_NORMAL, DF_MTK, DF_MKT, DF_NCHW}; for (U32 i = 0; i < num; i++) { @@ -66,103 +132,176 @@ EE eltwise_cpu(std::vector inputDesc, inputDesc[i] = tensor4df(inputDesc[i].dt, DF_NHWC, inputDesc[i].dims[2], inputDesc[i].dims[0], inputDesc[i].dims[1], 1); } - CHECK_STATUS(transformFormat(inputDesc[i], input[i], outputDesc, ptr)); - inputDesc[i] = outputDesc; + TensorDesc tmpDesc = outputDesc; + if (tensorNumElements(inputDesc[i]) < tensorNumElements(outputDesc)) { + tmpDesc = inputDesc[i]; + tmpDesc.df = outputDesc.df; + } + CHECK_STATUS(transformFormat(inputDesc[i], input[i], tmpDesc, ptr)); + inputDesc[i] = tmpDesc; input[i] = ptr; - ptr += tensorNumBytes(outputDesc); + ptr += tensorNumBytes(tmpDesc); } } I32 oneCount = 0; - for (int i = 0; i < ((int)outputDesc.nDims) - 1; i++) { + for (int i = 0; i < (int)outputDesc.nDims - 1; i++) { if (outputDesc.dims[i] == 1) { oneCount++; } else { break; } } - TensorDesc newOutputDesc = outputDesc; + for (int i = 0; i < (int)outputDesc.nDims - oneCount; i++) { - newOutputDesc.dims[i] = outputDesc.dims[oneCount + i]; + outputDesc.dims[i] = outputDesc.dims[oneCount + i]; } - newOutputDesc.nDims = outputDesc.nDims - oneCount; + outputDesc.nDims = outputDesc.nDims - oneCount; - std::vector newInputDesc(num); for (U32 i = 0; i < num; i++) { - newInputDesc[i] = inputDesc[i]; + TensorDesc desc = inputDesc[i]; for (int j = 0; 
j < (int)inputDesc[i].nDims - oneCount; j++) { - newInputDesc[i].dims[j] = inputDesc[i].dims[oneCount + j]; + desc.dims[j] = inputDesc[i].dims[oneCount + j]; } - newInputDesc[i].nDims = inputDesc[i].nDims - oneCount; - for (U32 j = newInputDesc[i].nDims; j < newOutputDesc.nDims; j++) { - newInputDesc[i].dims[j] = 1; + desc.nDims = inputDesc[i].nDims - oneCount; + for (U32 j = desc.nDims; j < outputDesc.nDims; j++) { + desc.dims[j] = 1; } - newInputDesc[i].nDims = newOutputDesc.nDims; + desc.nDims = outputDesc.nDims; + inputDesc[i] = desc; } - U32 size = tensorNumElements(newOutputDesc); - int lastDimSize = newOutputDesc.dims[0]; +} + +static EE eltwise_kernel(std::vector inputDesc, + std::vector input, + EltwiseParamSpec p, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + U32 num = inputDesc.size(); + int lastDimSize = outputDesc.dims[0]; std::vector lastDimSizes(num); bool sameDim = true; for (U32 i = 0; i < num; i++) { - lastDimSizes[i] = newInputDesc[i].dims[0]; + lastDimSizes[i] = inputDesc[i].dims[0]; if (lastDimSizes[i] != lastDimSize) { sameDim = false; - if (newInputDesc[0].df == DF_NCHWC8 || newInputDesc[0].df == DF_NCHWC16) { + if (inputDesc[0].df == DF_NCHWC8 || inputDesc[0].df == DF_NCHWC16) { UNI_ERROR_LOG("For NCHWC8 and NCHWC16, eltwise can only handle inputs with " "matching widths\n"); } } } - for (U32 i = 1; i < newOutputDesc.nDims; i++) { + for (U32 i = 1; i < outputDesc.nDims; i++) { for (U32 j = 0; j < num; j++) { - if (newInputDesc[j].dims[i] != newOutputDesc.dims[i]) { + if (inputDesc[j].dims[i] != outputDesc.dims[i]) { sameDim = false; break; } } if (sameDim) { - lastDimSize *= newOutputDesc.dims[i]; + lastDimSize *= outputDesc.dims[i]; for (U32 j = 0; j < num; j++) { - lastDimSizes[j] *= newInputDesc[j].dims[i]; + lastDimSizes[j] *= inputDesc[j].dims[i]; } } else { break; } } - std::vector newInput(num); EE ret = NOT_SUPPORTED; - for (U32 i = 0; i < size; i += lastDimSize) { - std::vector index = calculateLocalIndex(i, newOutputDesc.dims, newOutputDesc.nDims); + if (sameDim) { // if merged to the next loop, it will be slower when using openmp. 
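The restructured eltwise path above first folds every trailing dimension on which all inputs agree into one contiguous run (lastDimSize), so the element-wise kernel is called on long vectors; only when a dimension mismatches does it fall back to the per-tile loop with index remapping shown next. A simplified standalone sketch of the folding idea (illustrative; the real code also keeps a per-input lastDimSizes):

#include <cstdio>
#include <vector>

int main()
{
    // dims are stored innermost-first, as in TensorDesc: [1, 20, 10] -> {10, 20, 1}
    std::vector<unsigned> out = {10, 20, 1};
    std::vector<std::vector<unsigned>> in = {{10, 20, 1},   // same shape as the output
                                             {10, 1, 1}};   // broadcast over the middle dim
    unsigned run = out[0];
    for (size_t d = 1; d < out.size(); d++) {
        bool same = true;
        for (const auto &shape : in) {
            if (shape[d] != out[d]) {
                same = false;
                break;
            }
        }
        if (!same) {
            break;
        }
        run *= out[d];
    }
    // only the innermost 10 elements are contiguous for both inputs here
    printf("contiguous run per kernel call: %u\n", run);
    return 0;
}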
+ if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = eltwise_general(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = eltwise_arm(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = eltwise_x86(outputDesc.dt, input, lastDimSizes, num, lastDimSize, output, p.mode); +#endif + } + return ret; + } + + U32 loopNum = tensorNumElements(outputDesc) / lastDimSize; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < loopNum; ++i) { + std::vector index = calculateLocalIndex(i * lastDimSize, outputDesc.dims, outputDesc.nDims); + std::vector ip(num); for (U32 j = 0; j < num; j++) { std::vector relativeIndex = calculateRelativeLocalIndex_cpu( - index.data(), newInputDesc[j].dims, newInputDesc[j].nDims); - U32 globalIndex = calculateGlobalIndex( - relativeIndex.data(), newInputDesc[j].dims, newInputDesc[j].nDims); - newInput[j] = (U8 *)(input[j]) + globalIndex * bytesOf(newInputDesc[j].dt); + index.data(), inputDesc[j].dims, inputDesc[j].nDims); + U32 globalIndex = + calculateGlobalIndex(relativeIndex.data(), inputDesc[j].dims, inputDesc[j].nDims); + ip[j] = (U8 *)(input[j]) + globalIndex * bytesOf(inputDesc[j].dt); } - U8 *newOutput = (U8 *)output + i * bytesOf(newOutputDesc.dt); + U8 *op = (U8 *)output + i * lastDimSize * bytesOf(outputDesc.dt); if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = eltwise_general(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, - newOutput, eltwiseDesc.elt_mode); + ret = eltwise_general(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { - ret = eltwise_arm(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, - eltwiseDesc.elt_mode); + ret = eltwise_arm(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = eltwise_x86(newOutputDesc.dt, newInput, lastDimSizes, num, lastDimSize, newOutput, - eltwiseDesc.elt_mode); + ret = eltwise_x86(outputDesc.dt, ip, lastDimSizes, num, lastDimSize, op, p.mode); #endif } } - if (ret == SUCCESS && eltwiseDesc.activation_type != ACTIVATION_NULL) { - ActivationParamSpec p; - p.mode = eltwiseDesc.activation_type; - ret = activation_cpu(outputDesc, output, p, outputDesc, output, arch); + return ret; +} + +// [1, 10, 10] + [1, 10, 10] = [1, 10, 10] +// [1, 10, 1] + [1, 1, 10] = [1, 10, 10] +// [1, 20, 10] + [10] = [1. 
20, 10] + [1, 1, 10] = [1, 20, 10] +EE eltwise_cpu(std::vector inputDesc, + std::vector input, + EltwiseParamSpec p, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *output, + Arch arch) +{ + U32 num = inputDesc.size(); + if (num <= 1 || outputDesc.nDims < 1) { + return NOT_MATCH; + } + if (tensorNumElements(outputDesc) == 0) { + return SUCCESS; + } + align_param(inputDesc, input, tmp, outputDesc); + + EE ret = NOT_SUPPORTED; + int scaleId = -1; + TensorDesc scaleDesc; + int axis = scale_axis(inputDesc, outputDesc, &scaleId, &scaleDesc); + if (axis >= 0 && (p.mode == ELTWISE_PROD || p.mode == ELTWISE_SUM)) { + ScaleParamSpec sp; + sp.axis = axis; + if (p.mode == ELTWISE_PROD) { + ret = scale_cpu(scaleDesc, input[scaleId], input[1 - scaleId], nullptr, sp, scaleDesc, + output, arch); + } else { + ret = scale_cpu(scaleDesc, input[scaleId], nullptr, input[1 - scaleId], sp, scaleDesc, + output, arch); + } + } else { + ret = eltwise_kernel(inputDesc, input, p, outputDesc, output, arch); + } + if (ret == SUCCESS && p.activation_type != ACTIVATION_NULL) { + ActivationParamSpec ap; + ap.mode = p.activation_type; + ret = activation_cpu(outputDesc, output, ap, outputDesc, output, arch); } return ret; } diff --git a/compute/tensor/src/cpu/embedding.cpp b/compute/tensor/src/cpu/embedding.cpp index 9946248a..5bd0b156 100644 --- a/compute/tensor/src/cpu/embedding.cpp +++ b/compute/tensor/src/cpu/embedding.cpp @@ -25,8 +25,8 @@ EE embedding_cpu(TensorDesc inputDesc, U8 *outputPtr = (U8 *)output; U32 len = tensorNumElements(inputDesc); U32 elementBytes = bytesOf(weightDesc.dt); - U32 wordEmbeddingCPUBytes = elementBytes * p.num_output; - U32 transposeStride = elementBytes * p.input_dim; + U32 wordEmbeddingCPUBytes = elementBytes * p.num_outputs; + U32 transposeStride = elementBytes * p.num_inputs; EE ret = SUCCESS; for (U32 i = 0; i < len; i++) { U32 wordIndex = 0; @@ -52,14 +52,14 @@ EE embedding_cpu(TensorDesc inputDesc, U8 *dest = outputPtr; if (p.transpose) { U8 *src = weightPtr + wordIndex * elementBytes; - for (U32 j = 0; j < p.num_output; j++) { - memcpy(dest, src, elementBytes); + for (U32 j = 0; j < p.num_outputs; j++) { + UNI_MEMCPY(dest, src, elementBytes); src += transposeStride; dest += elementBytes; } } else { U8 *src = weightPtr + wordIndex * wordEmbeddingCPUBytes; - memcpy(dest, src, wordEmbeddingCPUBytes); + UNI_MEMCPY(dest, src, wordEmbeddingCPUBytes); } outputPtr += wordEmbeddingCPUBytes; } diff --git a/compute/tensor/src/cpu/gat.cpp b/compute/tensor/src/cpu/gat.cpp index 4d475818..f042964c 100644 --- a/compute/tensor/src/cpu/gat.cpp +++ b/compute/tensor/src/cpu/gat.cpp @@ -46,7 +46,7 @@ void preprocess(TensorDesc node_feature_desc, std::vector inputDescs = {outputDesc, outputDesc, outputDesc}; std::vector inputs = {out0, out1, edge_feature}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = ACTIVATION_NULL; CHECK_STATUS(eltwise_cpu(inputDescs, inputs, eltwiseDesc, 0, nullptr, outputDesc, output, arch)); @@ -82,7 +82,7 @@ void neighborhood_aware_softmax_yun(TensorDesc inputDesc, } #endif T *out1 = (T *)tmp; - memset(out1, 0, sizeof(T) * num_nodes * num_heads); + UNI_MEMSET(out1, 0, sizeof(T) * num_nodes * num_heads); for (int i = 0; i < num_edges; i++) { int node = nodes1[i]; for (int j = 0; j < num_heads; j++) { @@ -92,13 +92,13 @@ void neighborhood_aware_softmax_yun(TensorDesc inputDesc, for (int i = 0; i < num_edges; i++) { int node = nodes1[i]; - memcpy(output + i * num_heads, out1 + node * 
num_heads, num_heads * sizeof(T)); + UNI_MEMCPY(output + i * num_heads, out1 + node * num_heads, num_heads * sizeof(T)); } std::vector inputDescs = {inputDesc, inputDesc}; std::vector inputs = {out0, output}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_DIV; + eltwiseDesc.mode = ELTWISE_DIV; eltwiseDesc.activation_type = ACTIVATION_NULL; CHECK_STATUS(eltwise_cpu(inputDescs, inputs, eltwiseDesc, 0, nullptr, inputDesc, output, arch)); } @@ -112,7 +112,7 @@ void scatter_atten_score(const int *nodes0, int num_edges, T *out) { - memset(out, 0, sizeof(T) * num_heads * num_nodes * num_nodes); + UNI_MEMSET(out, 0, sizeof(T) * num_heads * num_nodes * num_nodes); for (int j = 0, k = 0; j < num_edges; j++) { int node0 = nodes0[j]; int node1 = nodes1[j]; @@ -142,7 +142,7 @@ EE gat_cpu(TensorDesc node_feature_desc, tmp = (U8 *)out1 + tensorNumBytes(edge_feature_desc); // tmpBytes = tensorNumBytes(edge_feature_desc) * 2 preprocess(node_feature_desc, node_desc, node_features0, nodes0, node_features1, nodes1, - edge_feature, p.activation, tmp, edge_feature_desc, out0, arch); + edge_feature, p.activation_type, tmp, edge_feature_desc, out0, arch); int num_heads = p.num_heads; int num_nodes = node_feature_desc.dims[1]; diff --git a/compute/tensor/src/cpu/gather.cpp b/compute/tensor/src/cpu/gather.cpp index 5dc501af..d0ce6211 100644 --- a/compute/tensor/src/cpu/gather.cpp +++ b/compute/tensor/src/cpu/gather.cpp @@ -24,7 +24,7 @@ inline static void gather(const TensorDesc &dataDesc, { int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - int outer_loop = 1, k = dataDesc.dims[axis], inner_loop = 1; + int outer_loop = 1, k = dataDesc.dims[axis], loop = tensorNumElements(indexDesc), inner_loop = 1; for (int i = 0; i < axis; i++) { inner_loop *= dataDesc.dims[i]; } @@ -32,11 +32,18 @@ inline static void gather(const TensorDesc &dataDesc, outer_loop *= dataDesc.dims[i]; } int tile_size = inner_loop; - for (int i = 0, dst_index = 0; i < outer_loop; i++) { - for (U32 j = 0; j < tensorNumElements(indexDesc); j++, dst_index += tile_size) { - int src_index = (i * k + index[j]) * tile_size; - memcpy(output + dst_index, data + src_index, tile_size * sizeof(T)); - } +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int o = 0; o < outer_loop * loop; o++) { + int i = o / loop; + int j = o % loop; + U32 dst_index = o * tile_size; + //for (int i = 0, dst_index = 0; i < outer_loop; i++) + //for (U32 j = 0; j < loop; j++, dst_index += tile_size) + int stable_index = index[j] < 0 ? index[j] + k : index[j]; + int src_index = (i * k + stable_index) * tile_size; + UNI_MEMCPY(output + dst_index, data + src_index, tile_size * sizeof(T)); } } @@ -51,12 +58,14 @@ inline static void gather_elements(const TensorDesc &dataDesc, { int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - - for (U32 i = 0; i < tensorNumElements(dataDesc); i++) { - std::vector local = calculateLocalIndex(i, dataDesc.dims, dataDesc.nDims); - local[axis] = index[i]; - U32 k = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); - output[i] = data[k]; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (U32 i = 0; i < tensorNumElements(indexDesc); i++) { + std::vector local = calculateLocalIndex(i, indexDesc.dims, indexDesc.nDims); + local[axis] = index[i] < 0 ? 
index[i] + dataDesc.dims[axis] : index[i]; + U32 idx = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); + output[i] = data[idx]; } } @@ -82,20 +91,32 @@ inline static void gatherND(const TensorDesc &dataDesc, newDataDesc.dims[axis + 1] = batch_dims_size; newDataDesc.nDims = axis + 1 + 1; - U32 gather_index[16] = {0}; int tile_dims = newDataDesc.nDims - (k + 1); - gather_index[tile_dims + k] = p.batch_dims; U32 tile_size = 1; for (int i = 0; i < tile_dims; i++) { tile_size *= newDataDesc.dims[i]; } - for (int batch_dim = 0, i = 0, dst_index = 0; batch_dim < batch_dims_size; batch_dim++) { - for (int outer_dim = 0; outer_dim < t; outer_dim++, i += k, dst_index += tile_size) { +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + U32 gather_index[16] = {0}; + gather_index[tile_dims + k] = p.batch_dims; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (int o = 0; o < batch_dims_size * t; o++) { + int batch_dim = o / t; + int outer_dim = o % t; + int i = o * k; + int dst_index = o * tile_size; + //for (int batch_dim = 0, i = 0, dst_index = 0; batch_dim < batch_dims_size; batch_dim++) + // for (int outer_dim = 0; outer_dim < t; outer_dim++, i += k, dst_index += tile_size) { for (int j = 0; j < k; j++) { gather_index[tile_dims + k - 1 - j] = index[i + j]; } U32 src_index = calculateGlobalIndex(gather_index, newDataDesc.dims, newDataDesc.nDims); - memcpy(output + dst_index, data + src_index, tile_size * sizeof(T)); + UNI_MEMCPY(output + dst_index, data + src_index, tile_size * sizeof(T)); } } } @@ -137,9 +158,14 @@ EE gather_cpu(TensorDesc dataDesc, EE ret = SUCCESS; switch (dataDesc.dt) { case DT_I32: + case DT_U32: gather_kernel(dataDesc, (const I32 *)data, indexDesc, (const int *)index, p, outputDesc, (I32 *)output); break; + case DT_U8: + gather_kernel(dataDesc, (const U8 *)data, indexDesc, (const int *)index, p, + outputDesc, (U8 *)output); + break; #ifdef _USE_FP32 case DT_F32: gather_kernel(dataDesc, (const F32 *)data, indexDesc, (const int *)index, p, diff --git a/compute/tensor/src/cpu/general/attention.cpp b/compute/tensor/src/cpu/general/attention.cpp index dc12c890..19c47a83 100644 --- a/compute/tensor/src/cpu/general/attention.cpp +++ b/compute/tensor/src/cpu/general/attention.cpp @@ -23,9 +23,9 @@ EE attention( } T minValue = -10000.0; - U32 count = array_sum_template(input, toSequenceLength); - U32 valid = UNI_MIN(count, fromSequenceLength); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_template(input + n * toSequenceLength, toSequenceLength); + U32 valid = UNI_MIN(count, fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { for (U32 j = 0; j < valid; j++) { for (U32 k = 0; k < toSequenceLength; k++) { diff --git a/compute/tensor/src/cpu/general/attention_mask.cpp b/compute/tensor/src/cpu/general/attention_mask.cpp index c4d45592..6e25f132 100644 --- a/compute/tensor/src/cpu/general/attention_mask.cpp +++ b/compute/tensor/src/cpu/general/attention_mask.cpp @@ -55,7 +55,7 @@ static EE attention_mask(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i][start], 0, sizeof(T) * loops); + UNI_MEMSET(&mask[i][start], 0, sizeof(T) * loops); } } I32 loops = tensorNumElements(inputDesc) / qlen / klen; diff --git a/compute/tensor/src/cpu/general/check.cpp b/compute/tensor/src/cpu/general/check.cpp deleted file mode 100644 index ed269423..00000000 --- a/compute/tensor/src/cpu/general/check.cpp +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. 
All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/general/tensor_computing_general.h" -#include "uni.h" - -template -static EE check(TensorDesc inputDescA, - const T *inputA, - TensorDesc inputDescB, - const T *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - UNUSED(inputDescB); - UNUSED(outputDesc); - - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - U32 loopInner = size / loopOuter; - - for (U32 i = 0; i < loopOuter; i++) { - U32 count = 0; - for (U32 j = 0; j < loopInner; j++) { - U32 index = i * loopInner + j; - switch (checkMode) { - case CHECK_EQUAL: { - if (inputA[index] == inputB[index]) { - count++; - } - break; - } - case CHECK_GREATEQUAL: { - if (inputA[index] >= inputB[index]) { - count++; - } - break; - } - case CHECK_GREAT: { - if (inputA[index] > inputB[index]) { - count++; - } - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - - if (count == loopInner) { - output[i] = 1; - } else { - output[i] = 0; - } - } - return SUCCESS; -} - -EE check_general(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - switch (idt) { -#ifdef _USE_FP16 - case DT_F16: { - ret = check(inputDescA, (const F16 *)inputA, inputDescB, (const F16 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif -#ifdef _USE_FP32 - case DT_F32: { - ret = check(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/compute/tensor/src/cpu/general/convolution.cpp b/compute/tensor/src/cpu/general/convolution.cpp index 3601897f..26ef62a2 100644 --- a/compute/tensor/src/cpu/general/convolution.cpp +++ b/compute/tensor/src/cpu/general/convolution.cpp @@ -49,9 +49,9 @@ inline EE convolution(TensorDesc inputDesc, U32 strideT = convParamSpec.stride_t; U32 strideH = 
convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingB = convParamSpec.padding_before; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingB = convParamSpec.pad_before; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 dilateT = convParamSpec.dilatedRate_t; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -202,9 +202,9 @@ EE convolution_general(TensorDesc inputDesc, UNUSED(biasDesc); if (eltwiseInput == nullptr) { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } else { - memcpy(output, eltwiseInput, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, eltwiseInput, tensorNumBytes(outputDesc)); } EE ret = NOT_SUPPORTED; diff --git a/compute/tensor/src/cpu/general/cumsum.cpp b/compute/tensor/src/cpu/general/cumsum.cpp new file mode 100644 index 00000000..41a983a5 --- /dev/null +++ b/compute/tensor/src/cpu/general/cumsum.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
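The new cumsum_general kernel below walks a single axis and honours exclusive and reverse flags. A standalone 1-D sketch of the variants it has to produce (illustrative helper, not the patch code):

#include <cstdio>
#include <vector>

static std::vector<int> cumsum1d(const std::vector<int> &x, bool exclusive, bool reverse)
{
    std::vector<int> y(x.size());
    int acc = 0;
    if (!reverse) {
        for (size_t i = 0; i < x.size(); i++) {
            y[i] = exclusive ? acc : acc + x[i];  // exclusive shifts the sums by one slot
            acc += x[i];
        }
    } else {
        for (size_t i = x.size(); i-- > 0;) {     // accumulate from the end of the axis
            y[i] = exclusive ? acc : acc + x[i];
            acc += x[i];
        }
    }
    return y;
}

int main()
{
    std::vector<int> x = {1, 2, 3, 4};
    for (int v : cumsum1d(x, false, false)) printf("%d ", v);  // 1 3 6 10
    printf("\n");
    for (int v : cumsum1d(x, true, false)) printf("%d ", v);   // 0 1 3 6
    printf("\n");
    for (int v : cumsum1d(x, false, true)) printf("%d ", v);   // 10 9 7 4
    printf("\n");
    return 0;
}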
+ +#include "cpu/general/tensor_computing_general.h" + +template +static void cumsum( + TensorDesc inputDesc, const T *input, CumSumParamSpec p, TensorDesc outputDesc, T *output) +{ + int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; + axis = inputDesc.nDims - 1 - axis; + int loopOuter = 1, loopInner = 1; + for (int i = 0; i < axis; i++) { + loopInner *= inputDesc.dims[i]; + } + int loops = inputDesc.dims[axis]; + for (U32 i = axis + 1; i < inputDesc.nDims; i++) { + loopOuter *= inputDesc.dims[i]; + } + int id, id1; + for (int i = 0; i < loopOuter; i++) { + for (int j = 0; j < loopInner; j++) { + if (p.reverse) { + id = (i * loops + loops - 1) * loopInner + j; + if (p.exclusive) { + output[id] = 0; + id1 = id; + id -= loopInner; + } else { + output[id] = input[id]; + id1 = id - loopInner; + id = id1; + } + for (int k = loops - 2; k >= 0; k--, id -= loopInner, id1 -= loopInner) { + output[id] = output[id + loopInner] + input[id1]; + } + } else { + id = i * loops * loopInner + j; + if (p.exclusive) { + output[id] = 0; + id1 = id; + id += loopInner; + } else { + output[id] = input[id]; + id1 = id + loopInner; + id = id1; + } + for (int k = 1; k < loops; k++, id += loopInner, id1 += loopInner) { + output[id] = output[id - loopInner] + input[id1]; + } + } + } + } +} + +EE cumsum_general( + TensorDesc inputDesc, const void *input, CumSumParamSpec p, TensorDesc outputDesc, void *output) +{ + DataType idt = inputDesc.dt; + EE ret = SUCCESS; + switch (idt) { +#ifdef _USE_FP16 + case DT_F16: { + cumsum(inputDesc, (const F16 *)input, p, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + cumsum(inputDesc, (const F32 *)input, p, outputDesc, (F32 *)output); + break; + } +#endif + case DT_I32: { + cumsum(inputDesc, (const I32 *)input, p, outputDesc, (I32 *)output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + + return ret; +} diff --git a/compute/tensor/src/cpu/general/deconvolution.cpp b/compute/tensor/src/cpu/general/deconvolution.cpp index e46953c9..5a983def 100644 --- a/compute/tensor/src/cpu/general/deconvolution.cpp +++ b/compute/tensor/src/cpu/general/deconvolution.cpp @@ -36,12 +36,12 @@ inline EE deconvolution(TensorDesc inputDesc, U32 group = convParamSpec.group; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 ocGroupSize = oc / group; // initialize outputs to 0 - memset(outArray, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(outArray, 0, tensorNumBytes(outputDesc)); U32 ic8 = ic / 8; U32 oc8 = oc / 8; for (U32 n = 0; n < in; n++) { diff --git a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp index 739340b6..28df05ea 100644 --- a/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/general/depthwise_pointwise_convolution.cpp @@ -66,8 +66,8 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; U32 dilatedRateH = convParamSpec.dilatedRate_h; U32 dilatedRateW = convParamSpec.dilatedRate_w; 
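The depthwise_pointwise_convolution hunk that follows generalises the input and output indexing from a fixed block of 8 channels to either NCHWC8 or NCHWC16. A standalone sketch of the blocked-layout offset it computes (illustrative helper, mirroring the i_off expression in the hunk, not part of the patch):

#include <cstdio>

// offset of logical element (n, c, h, w) in a channel-blocked NCHWCx tensor;
// cx = 8 for NCHWC8, 16 for NCHWC16, and C is assumed divisible by cx
static unsigned nchwcx_offset(unsigned n, unsigned c, unsigned h, unsigned w,
    unsigned C, unsigned H, unsigned W, unsigned cx)
{
    unsigned cBlocks = C / cx;
    return (((n * cBlocks + c / cx) * H + h) * W + w) * cx + c % cx;
}

int main()
{
    // channel 10 of a 16-channel, 4x4 NCHWC8 tensor sits in block 1, lane 2
    printf("%u\n", nchwcx_offset(0, 10, 0, 0, 16, 4, 4, 8));  // prints 130
    return 0;
}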
@@ -80,8 +80,24 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, } else { pwArray = outArray; } - U32 ic8 = ic / 8; - U32 oc8 = oc / 8; + U32 ic8 = ic; + U32 oc8 = oc; + U32 icx = 1; + U32 ocx = 1; + if (idf == DF_NCHWC16) { + icx = 16; + ic8 /= 16; + } else if (idf == DF_NCHWC8) { + icx = 8; + ic8 /= 8; + } + if (odf == DF_NCHWC16) { + ocx = 16; + oc8 /= 16; + } else if (odf == DF_NCHWC8) { + ocx = 8; + oc8 /= 8; + } for (U32 n = 0, pw_off = 0; n < in; n++) { // dw conv for (U32 c = 0; c < ic; c++) { @@ -94,11 +110,12 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, I32 iw_idx = w * strideW - paddingL + fw_idx * dilatedRateW; if (ih_idx >= 0 && ih_idx < (I32)ih && iw_idx >= 0 && iw_idx < (I32)iw) { U32 i_off; - if (idf != DF_NCHWC8) { - i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; + if (idf == DF_NCHWC8 || idf == DF_NCHWC16) { + i_off = (((n * ic8 + (c / icx)) * ih + ih_idx) * iw + iw_idx) * + icx + + c % icx; } else { - i_off = (((n * ic8 + (c / 8)) * ih + ih_idx) * iw + iw_idx) * 8 + - c % 8; + i_off = ((n * ic + c) * ih + ih_idx) * iw + iw_idx; } value += inArray[i_off] * dwFilterArray[c * fh * fw + fh_idx * fw + fw_idx]; @@ -108,10 +125,10 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS( activation_template(depthwiseActivationParamSpec, value, &value)); - if (fuseDepthwisePointwise || odf != DF_NCHWC8) { + if (fuseDepthwisePointwise || (odf != DF_NCHWC8 && odf != DF_NCHWC16)) { pwArray[pw_off] = value; } else { - pwArray[(((n * ic8 + (c / 8)) * oh + h) * ow + w) * 8 + c % 8] = value; + pwArray[(((n * ic8 + (c / ocx)) * oh + h) * ow + w) * ocx + c % ocx] = value; } } } @@ -128,10 +145,10 @@ inline EE depthwise_pointwise_convolution(TensorDesc inputDesc, CHECK_STATUS( activation_template(pointwiseActivationParamSpec, value, &value)); U32 o_off; - if (odf != DF_NCHWC8) { - o_off = (n * oc + o) * oh * ow + hw; + if (odf == DF_NCHWC8 || odf == DF_NCHWC16) { + o_off = ((n * oc8 + (o / ocx)) * oh * ow + hw) * ocx + o % ocx; } else { - o_off = ((n * oc8 + (o / 8)) * oh * ow + hw) * 8 + o % 8; + o_off = (n * oc + o) * oh * ow + hw; } outArray[o_off] += value; } @@ -161,9 +178,9 @@ EE depthwise_pointwise_convolution_general(TensorDesc inputDesc, ActivationParamSpec pointwiseActivationParamSpec) { if (eltwiseInput == nullptr) { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } else { - memcpy(output, eltwiseInput, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, eltwiseInput, tensorNumBytes(outputDesc)); } EE ret = SUCCESS; switch (inputDesc.dt) { diff --git a/compute/tensor/src/cpu/general/general_functions.h b/compute/tensor/src/cpu/general/general_functions.h index bab0a7f4..1790f222 100644 --- a/compute/tensor/src/cpu/general/general_functions.h +++ b/compute/tensor/src/cpu/general/general_functions.h @@ -278,6 +278,9 @@ inline EE array_minmax_value_general(DataType dt, const void *data, I32 len, int case DT_I32: ret = array_minmax_value_template((const I32 *)data, len, mode, result); break; + case DT_U32: + ret = array_minmax_value_template((const U32 *)data, len, mode, result); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/general/normalization.cpp b/compute/tensor/src/cpu/general/normalization.cpp index 793ebd7b..fdc824f9 100644 --- a/compute/tensor/src/cpu/general/normalization.cpp +++ b/compute/tensor/src/cpu/general/normalization.cpp @@ -16,11 +16,12 @@ #include "cpu/general/general_functions.h" #include 
"cpu/general/tensor_computing_general.h" +static float eps = 1e-6; + template -inline EE array_norm_scale_template( +inline static EE array_norm_scale_template( T *input, T *output, I32 len, F32 mean, F32 var, T *alpha, T *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); for (I32 i = 0; i < len; i++) { output[i] = alpha[i] * (input[i] - mean) / std_value + beta[i]; @@ -29,12 +30,9 @@ inline EE array_norm_scale_template( } template -inline EE layer_normalization_template( +static EE layer_normalization_nhwc( TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) { - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } if (inputDesc.dt != outputDesc.dt || inputDesc.df != outputDesc.df) { CHECK_STATUS(NOT_MATCH); } @@ -51,32 +49,104 @@ inline EE layer_normalization_template( array_norm_scale_template( current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +template +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, T *input, T *alpha, T *beta, TensorDesc outputDesc, T *output) +{ + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + for (int i = 0; i < n; i++) { + for (int j = 0; j < hw; j++) { + F32 sum = 0; + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++) { + sum += input[a]; + } + } + F32 mean = sum / c; + + sum = 0; + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++) { + F32 tmp = input[a] - mean; + sum += tmp * tmp; + } + } + F32 var = sum / c; + F32 std_value = sqrt(var + eps); + for (int k = 0, kk = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + for (int a = id; a < id + 8; a++, kk++) { + output[a] = alpha[kk] * ((input[a] - mean) / std_value) + beta[kk]; + } + } + } + } return SUCCESS; } -EE layer_normalization_general( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +template +static EE layer_normalization_template(TensorDesc inputDesc, + T *input, + LayerNormParamSpec p, + T *alpha, + T *beta, + TensorDesc outputDesc, + T *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} + +EE layer_normalization_general(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_template( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = layer_normalization_template( - inputDesc, (F16 *)input, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); + inputDesc, (F16 *)input, p, (F16 *)alpha, (F16 *)beta, outputDesc, (F16 *)output); break; } 
#endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/general/padding.cpp b/compute/tensor/src/cpu/general/padding.cpp index 202bcb52..892289d1 100644 --- a/compute/tensor/src/cpu/general/padding.cpp +++ b/compute/tensor/src/cpu/general/padding.cpp @@ -39,33 +39,33 @@ EE padding_general(TensorDesc inputDesc, (const U8 *)input + (((n * ic + c) * ih + h) * iw) * alignSize * bytesOf(idt); U8 *outPtr = (U8 *)output + (((n * oc + c) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); + UNI_MEMSET(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -73,20 +73,20 @@ EE padding_general(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + 
((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -97,21 +97,21 @@ EE padding_general(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_MEMSET(outPtr + index, 0, ow * alignSize * bytesOf(odt)); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - // memcpy(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + // UNI_MEMCPY(outPtr+index, outPtr+((padParamSpec.top+ih-2-h)*ow*alignSize*bytesOf(odt)), ow*alignSize*bytesOf(odt)); + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - padParamSpec.bottom + h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/general/pooling.cpp b/compute/tensor/src/cpu/general/pooling.cpp index 54dcf007..2dc836e6 100644 --- a/compute/tensor/src/cpu/general/pooling.cpp +++ b/compute/tensor/src/cpu/general/pooling.cpp @@ -16,9 +16,10 @@ #include "cpu/general/tensor_computing_general.h" -template -EE pooling(T *input, - T *output, +template +EE pooling(DataType idt, + T1 *input, + T1 *output, I32 in, I32 ic, I32 it, @@ -27,25 +28,26 @@ EE pooling(T *input, I32 ot, I32 oh, I32 ow, - I32 stride_t, - I32 stride_h, - I32 stride_w, - I32 padding_before, - I32 padding_after, - I32 padding_top, - I32 padding_bottom, - I32 padding_left, - I32 padding_right, - I32 kernel_t, - I32 kernel_h, - I32 kernel_w, - PoolingMode pm, - RoundMode rm, + PoolingParamSpec p, I32 alignSize, - F32 minValue) + F32 minValue, + void *scale) { CHECK_REQUIREMENT(ic % alignSize == 0); ic = ic / alignSize; + float poolSize = p.kernel_t * p.kernel_h * p.kernel_w; + +#ifdef _USE_INT8 + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + I32 shift = 65536; + I32 factor = shift / poolSize; + if (p.mode == POOLING_MAX) { + *outputScale = *inputScale; + } else { + *outputScale = *inputScale * factor * poolSize / (F32)shift; + } +#endif EE ret = SUCCESS; for (I32 n = 0; n < in; n++) { @@ -54,26 +56,29 @@ EE pooling(T *input, for (I32 t = 0; t < ot; t++) { for (I32 h = 0; h < oh; h++) { for (I32 w = 0; w < ow; w++) { - int tstart = t * stride_t - padding_before; - int hstart = h * stride_h - padding_top; - int wstart = w * stride_w - padding_left; - int tend = tstart + kernel_t; - int hend = hstart + kernel_h; - int wend = wstart + kernel_w; + int tstart = t * p.stride_t - p.pad_before; + int hstart = h * p.stride_h - p.pad_top; + int wstart = w * p.stride_w - p.pad_left; + int tend = tstart + p.kernel_t; + int hend = hstart + p.kernel_h; + int wend = wstart + p.kernel_w; tstart = UNI_MAX(tstart, 0); hstart = UNI_MAX(hstart, 0); wstart = UNI_MAX(wstart, 0); tend = UNI_MIN(tend, 
it); hend = UNI_MIN(hend, ih); wend = UNI_MIN(wend, iw); - float poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); - T value; - switch (pm) { + if (!p.count_include_pad) { + poolSize = (tend - tstart) * (hend - hstart) * (wend - wstart); + } + T1 maxVal = 0; + T2 meanVal = 0; + switch (p.mode) { case POOLING_MAX: - value = minValue; + maxVal = minValue; break; case POOLING_MEAN: - value = 0; + meanVal = 0; break; default: return NOT_SUPPORTED; @@ -86,13 +91,13 @@ EE pooling(T *input, U32 in_off = ((((n * ic + c) * it + z) * ih + x) * iw + y) * alignSize + j; - switch (pm) { + switch (p.mode) { case POOLING_MAX: - value = (value > input[in_off]) ? value - : input[in_off]; + maxVal = (maxVal > input[in_off]) ? maxVal + : input[in_off]; break; case POOLING_MEAN: - value += input[in_off]; + meanVal += input[in_off]; break; default: ret = NOT_SUPPORTED; @@ -101,18 +106,25 @@ EE pooling(T *input, } } } - switch (pm) { + switch (p.mode) { case POOLING_MAX: + output[out_off] = maxVal; break; case POOLING_MEAN: - value = value / poolSize; + if (idt == DT_I8 || idt == DT_U8_Q) { +#ifdef _USE_INT8 + I32 factor = shift / + ((tend - tstart) * (hend - hstart) * (wend - wstart)); + output[out_off] = ((I32)meanVal * factor) >> 16; +#endif + } else { + output[out_off] = meanVal / poolSize; + } break; default: ret = NOT_SUPPORTED; break; } - - output[out_off] = value; } } } @@ -122,8 +134,12 @@ EE pooling(T *input, return ret; } -EE pooling_general( - TensorDesc inputDesc, const void *input, PoolingParamSpec p, TensorDesc outputDesc, void *output) +EE pooling_general(TensorDesc inputDesc, + const void *input, + PoolingParamSpec p, + void *scale, + TensorDesc outputDesc, + void *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -143,25 +159,35 @@ EE pooling_general( return NOT_SUPPORTED; } - if (in != on || ic != oc || idf != DF_NCHWC8 || odf != idf) { + if (in != on || ic != oc || (idf != DF_NCHWC8 && idf != DF_NCHWC16) || odf != idf) { CHECK_STATUS(NOT_MATCH); } + I32 alignSize = 8; + if (idf == DF_NCHWC16) { + alignSize = 16; + } EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: - ret = pooling((F32 *)input, (F32 *)output, in, ic, it, ih, iw, ot, oh, ow, p.stride_t, - p.stride_h, p.stride_w, p.padding_before, p.padding_after, p.padding_top, - p.padding_bottom, p.padding_left, p.padding_right, p.kernel_t, p.kernel_h, - p.kernel_w, p.mode, p.rm, 8, -FLT_MAX); + ret = pooling(idt, (F32 *)input, (F32 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -FLT_MAX, scale); break; #endif #ifdef _USE_FP16 case DT_F16: - ret = pooling((F16 *)input, (F16 *)output, in, ic, it, ih, iw, ot, oh, ow, p.stride_t, - p.stride_h, p.stride_w, p.padding_before, p.padding_after, p.padding_top, - p.padding_bottom, p.padding_left, p.padding_right, p.kernel_t, p.kernel_h, - p.kernel_w, p.mode, p.rm, 8, -UNI_F16_MAX); + ret = pooling(idt, (F16 *)input, (F16 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -UNI_F16_MAX, scale); + break; +#endif +#ifdef _USE_INT8 + case DT_I8: + ret = pooling(idt, (INT8 *)input, (INT8 *)output, in, ic, it, ih, iw, ot, oh, + ow, p, alignSize, -UNI_F16_MAX, scale); + break; + case DT_U8_Q: + ret = pooling(idt, (UINT8 *)input, (UINT8 *)output, in, ic, it, ih, iw, ot, + oh, ow, p, alignSize, -UNI_F16_MAX, scale); break; #endif default: diff --git a/compute/tensor/src/cpu/general/pooling_bp.cpp b/compute/tensor/src/cpu/general/pooling_bp.cpp index a4acc4f4..b178dab7 100644 --- a/compute/tensor/src/cpu/general/pooling_bp.cpp +++ 
b/compute/tensor/src/cpu/general/pooling_bp.cpp @@ -17,41 +17,28 @@ #include "cpu/general/tensor_computing_general.h" template -EE pooling_bp(T *input, - T *output, - U32 in, - U32 ic, - U32 ih, - U32 iw, - U32 strideH, - U32 strideW, - U32 paddingT, - U32 paddingL, - U32 kernelH, - U32 kernelW, - PoolingMode pm, - U32 oh, - U32 ow, - U32 alignSize) +EE pooling_bp( + T *input, T *output, U32 in, U32 ic, U32 ih, U32 iw, U32 oh, U32 ow, PoolingParamSpec p, U32 alignSize) { - UNUSED(pm); CHECK_REQUIREMENT(ic % alignSize == 0); ic = ic / alignSize; - + float poolSize = p.kernel_h * p.kernel_w; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 j = 0; j < alignSize; j++) { for (I32 h = 0; h < (I32)ih; h++) { for (I32 w = 0; w < (I32)iw; w++) { - int hstart = int(h * strideH - paddingT); - int wstart = int(w * strideW - paddingL); - int hend = hstart + kernelH; - int wend = wstart + kernelW; + int hstart = int(h * p.stride_h - p.pad_top); + int wstart = int(w * p.stride_w - p.pad_left); + int hend = hstart + p.kernel_h; + int wend = wstart + p.kernel_w; hstart = (hstart < 0) ? 0 : hstart; wstart = (wstart < 0) ? 0 : wstart; hend = (hend > (int)oh) ? oh : hend; wend = (wend > (int)ow) ? ow : wend; - float poolSize = (hend - hstart) * (wend - wstart); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } for (int x = hstart; x < hend; x++) { for (int y = wstart; y < wend; y++) { U32 in_off = ((((n * ic + c) * ih) + h) * iw + w) * alignSize + j; @@ -67,11 +54,8 @@ EE pooling_bp(T *input, return SUCCESS; } -EE pooling_bp_general(TensorDesc inputDesc, - const void *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - void *output) +EE pooling_bp_general( + TensorDesc inputDesc, const void *input, PoolingParamSpec p, TensorDesc outputDesc, void *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -88,24 +72,15 @@ EE pooling_bp_general(TensorDesc inputDesc, if (idf != DF_NCHWC8 || odf != idf) { CHECK_STATUS(NOT_MATCH); } - - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; - EE ret = SUCCESS; switch (idt) { #ifdef _USE_FP32 case DT_F32: - ret = pooling_bp((F32 *)input, (F32 *)output, in, ic, ih, iw, strideH, strideW, - paddingT, paddingL, kernelSizeH, kernelSizeW, poolingParamSpec.mode, oh, ow, 8); + ret = pooling_bp((F32 *)input, (F32 *)output, in, ic, ih, iw, oh, ow, p, 8); break; #endif default: ret = NOT_SUPPORTED; } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/general/rnn.cpp b/compute/tensor/src/cpu/general/rnn.cpp index 23e16d47..b45c648f 100644 --- a/compute/tensor/src/cpu/general/rnn.cpp +++ b/compute/tensor/src/cpu/general/rnn.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include #include "cpu/general/tensor_computing_general.h" @@ -69,14 +68,12 @@ static EE lstmcell(TensorDesc xDesc, U32 batch = in; U32 xDim = ix; - U32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 
2 : 1; + U32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -100,8 +97,8 @@ static EE lstmcell(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { T *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(T)); } else { intermediateH = tmpArray; xhArray = lastBatchH; @@ -109,7 +106,7 @@ static EE lstmcell(TensorDesc xDesc, // MVM const T *mBias = (const T *)bias[0] + m * steps * column * 4; - memcpy(intermediateH, mBias, column * 4 * sizeof(T)); + UNI_MEMCPY(intermediateH, mBias, column * 4 * sizeof(T)); mvm_nkn32_template(fn / 32, fk, (const T *)filter[0], xhArray, intermediateH); T *out_i = intermediateH; @@ -121,12 +118,12 @@ static EE lstmcell(TensorDesc xDesc, T *currentBatchH = currentHArray + m * currentHStride; T *currentOutput = outputArray + m * batchStrideH; T *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -138,7 +135,7 @@ static EE lstmcell(TensorDesc xDesc, F32 C_s = lastBatchState[h]; F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); F32 G_s = tanh(out_g[h]); - F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + rnnParamSpec.forget_bias))); F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); C_s = C_s * F_s + I_s * G_s; F32 value = O_s * tanh(C_s); @@ -146,28 +143,28 @@ static EE lstmcell(TensorDesc xDesc, tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_template(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); array_scale_template( - lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_template(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - memset(tmpH, 0, sizeof(T) * hDim); + if (rnnParamSpec.num_projection > 0) { + UNI_MEMSET(tmpH, 0, sizeof(T) * hDim); mvm_nkn32_template( - hDim / 32, rnnParamSpec.numProjection, (const T *)filter[1], tmpHH, tmpH); + hDim / 32, rnnParamSpec.num_projection, (const T *)filter[1], tmpHH, tmpH); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_template(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_template(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - 
array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_template(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_template(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(T) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(T) * hDim); } } return SUCCESS; @@ -210,12 +207,11 @@ static EE grucell(TensorDesc xDesc, U32 batch = in; U32 xDim = ix; - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; I32 column = hDim; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -237,15 +233,15 @@ static EE grucell(TensorDesc xDesc, T *currentBatchH = currentHArray + m * currentHStride; T *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(T)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(T)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(T)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(T)); } const T *mBias = (const T *)bias[0] + m * steps * column * 3; - memcpy(intermediateH, mBias, column * 2 * sizeof(T)); + UNI_MEMCPY(intermediateH, mBias, column * 2 * sizeof(T)); mvm_nkn32_template(column * 2 / 32, fk, (const T *)filter[0], xhArray, intermediateH); T *out_z = intermediateH; T *out_r = out_z + column; @@ -258,12 +254,12 @@ static EE grucell(TensorDesc xDesc, if (rnnParamSpec.mode == RNN_GRU_LBR) { T *h_x_b = (T *)mBias + column * 2; T *h_h_b = (T *)bias[1]; - memcpy(out_h, h_h_b, column * sizeof(T)); + UNI_MEMCPY(out_h, h_h_b, column * sizeof(T)); mvm_nkn32_template(column / 32, hDim, (const T *)filter[0] + column * 2 * fk + column * xDim, xhArray + xDim, out_h); array_mul_template(out_r, out_h, out_h, hDim); if (xDim > 0) { - memcpy(out_r, h_x_b, column * sizeof(T)); + UNI_MEMCPY(out_r, h_x_b, column * sizeof(T)); mvm_nkn32_template( column / 32, xDim, (const T *)filter[0] + column * 2 * fk, xhArray, out_r); h_x_b = out_r; @@ -271,7 +267,7 @@ static EE grucell(TensorDesc xDesc, array_add_template(h_x_b, out_h, out_h, hDim); } else { array_mul_template(out_r, xhArray + xDim, xhArray + xDim, hDim); - memcpy(out_h, (const T *)mBias + column * 2, column * sizeof(T)); + UNI_MEMCPY(out_h, (const T *)mBias + column * 2, column * sizeof(T)); mvm_nkn32_template( column / 32, fk, (const T *)filter[0] + column * 2 * fk, xhArray, out_h); } @@ -287,7 +283,7 @@ static EE grucell(TensorDesc xDesc, array_scale_template(out_z, out_z, column, -1, 1); array_mul_template(out_z, out_h, out_h, column); array_add_template(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(T) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(T) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/general/softmax.cpp b/compute/tensor/src/cpu/general/softmax.cpp index f454852d..493ff7c3 100644 --- a/compute/tensor/src/cpu/general/softmax.cpp +++ b/compute/tensor/src/cpu/general/softmax.cpp @@ -26,7 +26,7 @@ static F32 array_max(const T *input, U32 len, U32 stride) return tmp; } 
-template +template static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc outputDesc, T *output) { UNUSED(outputDesc); @@ -76,14 +76,23 @@ static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc out T *out = output + i * loops * loop_inner + j; F32 max_value = array_max(in, loops, loop_inner); F32 sum = 0; - for (U32 k = 0; k < loops; k++) { - F32 tmp = exp(in[k * loop_inner] - max_value); + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + F32 tmp = exp(in[d] - max_value); sum += tmp; - out[k * loop_inner] = tmp; + if (!logsoftmax) { + out[d] = tmp; + } } - sum = 1 / sum; - for (U32 k = 0; k < loops; k++) { - out[k * loop_inner] *= sum; + if (logsoftmax) { + sum = max_value + log(sum); + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + out[d] = in[d] - sum; + } + } else { + sum = 1 / sum; + for (U32 k = 0, d = 0; k < loops; k++, d += loop_inner) { + out[d] *= sum; + } } } } @@ -93,25 +102,49 @@ static EE softmax(TensorDesc inputDesc, const T *input, int axis, TensorDesc out EE softmax_general( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP16 case DT_F16: { - ret = softmax(inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + ret = softmax( + inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); break; } #endif #ifdef _USE_FP32 case DT_F32: { - ret = softmax(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + ret = softmax( + inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); break; } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_general( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP16 + case DT_F16: { + ret = softmax( + inputDesc, (const F16 *)input, p.axis, outputDesc, (F16 *)output); + break; + } +#endif +#ifdef _USE_FP32 + case DT_F32: { + ret = softmax( + inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/general/tensor_computing_general.h b/compute/tensor/src/cpu/general/tensor_computing_general.h index 98364fd4..2cb8766a 100644 --- a/compute/tensor/src/cpu/general/tensor_computing_general.h +++ b/compute/tensor/src/cpu/general/tensor_computing_general.h @@ -94,6 +94,7 @@ EE depthwise_convolution_general(TensorDesc inputDesc, EE pooling_general(TensorDesc inputDesc, const void *input, PoolingParamSpec poolingParamSpec, + void *scale, TensorDesc outputDesc, void *output); @@ -145,6 +146,9 @@ EE scale_general(TensorDesc inputDesc, EE softmax_general( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_general( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE check_general(TensorDesc inputDescA, const void *inputA, TensorDesc inputDescB, @@ -153,8 +157,13 @@ EE check_general(TensorDesc inputDescA, TensorDesc outputDesc, void *output); -EE layer_normalization_general( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_general(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc 
outputDesc, + void *output); EE attention_mask_general(TensorDesc inputDesc, const void *input, @@ -176,4 +185,7 @@ EE dequantize_general(TensorDesc qDesc, void *bData, TensorDesc dDesc, void *data); + +EE cumsum_general( + TensorDesc inputDesc, const void *input, CumSumParamSpec p, TensorDesc outputDesc, void *output); #endif diff --git a/compute/tensor/src/cpu/general/transpose.cpp b/compute/tensor/src/cpu/general/transpose.cpp index dbd0d0fd..22d1d4ee 100644 --- a/compute/tensor/src/cpu/general/transpose.cpp +++ b/compute/tensor/src/cpu/general/transpose.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "cpu/general/tensor_computing_general.h" EE transpose_general( @@ -42,7 +40,7 @@ EE transpose_general( inputIndex = (inputIndex + inputLocalIndex[j]) * inputDesc.dims[j - 1]; } inputIndex += inputLocalIndex[0]; - memcpy(output_ptr + i * bytesOf(outputDesc.dt), + UNI_MEMCPY(output_ptr + i * bytesOf(outputDesc.dt), input_ptr + inputIndex * bytesOf(inputDesc.dt), bytesOf(inputDesc.dt)); } return SUCCESS; diff --git a/compute/tensor/src/cpu/instance_norm.cpp b/compute/tensor/src/cpu/instance_norm.cpp index 36d76c13..d19d0e73 100644 --- a/compute/tensor/src/cpu/instance_norm.cpp +++ b/compute/tensor/src/cpu/instance_norm.cpp @@ -46,7 +46,7 @@ inline EE instance_norm_template( F32 eps = 1e-6; if (axisDim == (int)inputDesc.dims[axis]) { for (I32 i = 0; i < loopOuter; i += 8) { - F32 mean[8] = {0}; + double mean[8] = {0}; for (I32 j = 0; j < loopInner; ++j) { for (U32 ii = 0; ii < 8; ++ii) { mean[ii] += input[i * loopInner + j * 8 + ii]; diff --git a/compute/tensor/src/cpu/non_max_suppression.cpp b/compute/tensor/src/cpu/non_max_suppression.cpp index 23118306..7a9237e8 100644 --- a/compute/tensor/src/cpu/non_max_suppression.cpp +++ b/compute/tensor/src/cpu/non_max_suppression.cpp @@ -12,176 +12,71 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/tensor_computing_cpu.h" - -inline EE qsort_descent(std::vector &boxes, - std::vector &boxindex, - std::vector &scores, - int left, - int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - std::swap(boxindex[i], boxindex[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, boxindex, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, boxindex, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} +#include "cpu/non_max_suppression.h" template EE non_max_suppression_kernel(std::vector input, - T *output, U32 spatial_dim, U32 num_class, U32 max_output_boxes_per_class, F32 iou_threshold, - F32 score_threshold) + F32 score_threshold, + int *output, + U32 *length) { T *box = (T *)input[0]; T *score = (T *)input[1]; // decode box - std::vector> boxes; - boxes.resize(spatial_dim); + std::vector> boxes(spatial_dim); for (U32 i = 0; i < spatial_dim; i++) { - F32 ymin = std::min(box[i * 4], box[i * 4 + 2]); - F32 xmin = std::min(box[i * 4 + 1], box[i * 4 + 3]); - F32 ymax = std::max(box[i * 4], box[i * 4 + 2]); - F32 xmax = std::max(box[i * 4 + 1], box[i * 4 + 3]); - std::vector box_pixel; - box_pixel.resize(4); - box_pixel[0] = xmin; - box_pixel[1] = ymin; - box_pixel[2] = xmax; - box_pixel[3] = ymax; - boxes[i].assign(box_pixel.begin(), box_pixel.end()); + F32 ymin = UNI_MIN(box[i * 4], box[i * 4 + 2]); + F32 xmin = UNI_MIN(box[i * 4 + 1], box[i * 4 + 3]); + F32 ymax = UNI_MAX(box[i * 4], box[i * 4 + 2]); + F32 xmax = UNI_MAX(box[i * 4 + 1], box[i * 4 + 3]); + boxes[i] = {xmin, ymin, xmax, ymax}; } - std::vector all_boxinfo; + int count = 0; for (U32 i = 0; i < num_class; i++) { - std::vector class_boxrects; - std::vector class_boxscores; - std::vector class_boxindex; + std::vector class_boxes; for (U32 j = 0; j < spatial_dim; j++) { F32 score_pixel = score[i * spatial_dim + j]; if (score_pixel > score_threshold) { - std::vector inbox; - inbox.assign(boxes[j].begin(), boxes[j].end()); - BoxRect b = {inbox[0], inbox[1], inbox[2], inbox[3], i}; - class_boxrects.push_back(b); - class_boxindex.push_back(j); - class_boxscores.push_back(score_pixel); + BoxRect b = {boxes[j][0], boxes[j][1], boxes[j][2], boxes[j][3], i, score_pixel, j}; + 
class_boxes.push_back(b);
             }
         }
-        // sort boxes and box index
-        qsort_descent(class_boxrects, class_boxindex, class_boxscores, 0,
-            static_cast(class_boxscores.size() - 1));
-        std::vector picked;
+        // sort boxes by score
+        std::stable_sort(
+            class_boxes.begin(), class_boxes.end(), [&](const BoxRect &a, const BoxRect &b) {
+                return (a.score > b.score || (a.score == b.score && a.index < b.index));
+            });
         // apply nms
-        nms_pickedboxes(class_boxrects, picked, iou_threshold);
-        std::vector boxindex;
-        for (I64 p = 0; p < (I64)picked.size(); p++) {
-            I64 picked_box = picked[p];
-            boxindex.push_back(class_boxindex[picked_box]);
-        }
-        if (max_output_boxes_per_class < (U32)boxindex.size()) {
-            boxindex.resize(max_output_boxes_per_class);
+        std::vector picked = nms_pickedboxes(class_boxes, iou_threshold);
+        if (max_output_boxes_per_class < picked.size()) {
+            picked.resize(max_output_boxes_per_class);
         }
-        for (I64 j = 0; j < (I64)boxindex.size(); j++) {
-            BoxInfo bi;
-            bi.box_index = boxindex[j];
-            bi.label = i;
-            all_boxinfo.push_back(bi);
+        for (U32 j = 0; j < picked.size(); j++) {
+            output[count * 3] = 0;
+            // class_index
+            output[count * 3 + 1] = i;
+            // box_index
+            output[count * 3 + 2] = class_boxes[picked[j]].index;
+            count++;
         }
     }
-    U32 num_detected = all_boxinfo.size();
-    // the first box contains the number of availble boxes in the first element.
-    output[0] = num_detected;
-    output[1] = output[2] = 0;
-    for (U32 i = 0; i < num_detected; i++) {
-        BoxInfo bi = all_boxinfo[i];
-        // batch_index = 0
-        output[(i + 1) * 3] = 0;
-        // class_index
-        output[(i + 1) * 3 + 1] = bi.label;
-        // box_index
-        output[(i + 1) * 3 + 2] = bi.box_index;
-    }
+    *length = count;
     return SUCCESS;
 }
 
 EE non_max_suppression_cpu(std::vector inputDesc,
     std::vector input,
-    NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec,
+    NonMaxSuppressionParamSpec p,
     TensorDesc outputDesc,
-    void *output)
+    void *output,
+    U32 *length)
 {
     UNUSED(outputDesc);
     if (nullptr == output) {
@@ -198,25 +93,25 @@ EE non_max_suppression_cpu(std::vector inputDesc,
     U32 spatial_dim = ic0;
     U32 num_class = ic1;
     CHECK_REQUIREMENT(spatial_dim == ilens2);
-    U32 max_output_boxes_per_class = nonMaxSuppressionParamSpec.max_output_boxes_per_class;
-    F32 iou_threshold = nonMaxSuppressionParamSpec.iou_threshold;
-    F32 score_threshold = nonMaxSuppressionParamSpec.score_threshold;
     EE ret = SUCCESS;
     switch (idt0) {
 #ifdef _USE_FP32
         case DT_F32:
-            non_max_suppression_kernel(input, (F32 *)output, spatial_dim, num_class,
-                max_output_boxes_per_class, iou_threshold, score_threshold);
+            non_max_suppression_kernel(input, spatial_dim, num_class,
+                p.max_output_boxes_per_class, p.iou_threshold, p.score_threshold, (int *)output,
+                length);
             break;
 #endif
 #ifdef _USE_FP16
         case DT_F16:
-            non_max_suppression_kernel(input, (F16 *)output, spatial_dim, num_class,
-                max_output_boxes_per_class, iou_threshold, score_threshold);
+            non_max_suppression_kernel(input, spatial_dim, num_class,
+                p.max_output_boxes_per_class, p.iou_threshold, p.score_threshold, (int *)output,
+                length);
             break;
 #endif
         default:
             ret = NOT_SUPPORTED;
+            break;
     }
     return ret;
 }
diff --git a/compute/tensor/src/cpu/non_max_suppression.h b/compute/tensor/src/cpu/non_max_suppression.h
new file mode 100644
index 00000000..12a09ed6
--- /dev/null
+++ b/compute/tensor/src/cpu/non_max_suppression.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_NON_MAX_SUPPRESSION_TENSOR_COMPUTING +#define _H_NON_MAX_SUPPRESSION_TENSOR_COMPUTING + +#include "parameter_spec.h" +#include "uni.h" +#include +#include + +typedef struct { + float xmin; + float ymin; + float xmax; + float ymax; + unsigned int label; + float score; + unsigned int index; +} BoxRect; + +inline F32 intersectionarea(const BoxRect &a, const BoxRect &b) +{ + if (a.xmin >= b.xmax || a.xmax <= b.xmin || a.ymin >= b.ymax || a.ymax <= b.ymin) { + return 0.f; + } + F32 inter_width = UNI_MIN(a.xmax, b.xmax) - UNI_MAX(a.xmin, b.xmin); + F32 inter_height = UNI_MIN(a.ymax, b.ymax) - UNI_MAX(a.ymin, b.ymin); + return inter_width * inter_height; +} + +inline std::vector nms_pickedboxes(const std::vector &boxes, F32 nms_threshold) +{ + I32 n = boxes.size(); + std::vector areas(n); + for (I32 i = 0; i < n; i++) { + const BoxRect &box = boxes[i]; + F32 width = box.xmax - box.xmin; + F32 height = box.ymax - box.ymin; + areas[i] = width * height; + } + std::vector picked; + for (I32 i = 0; i < n; i++) { + const BoxRect &a = boxes[i]; + bool keep = true; + for (U32 j = 0; j < picked.size(); j++) { + const BoxRect &b = boxes[picked[j]]; + F32 inter_area = intersectionarea(a, b); + F32 union_area = areas[i] + areas[picked[j]] - inter_area; + if (inter_area / union_area > nms_threshold) { + keep = false; + break; + } + } + if (keep) { + picked.push_back(i); + } + } + return picked; +} +#endif diff --git a/compute/tensor/src/cpu/non_zero.cpp b/compute/tensor/src/cpu/non_zero.cpp new file mode 100644 index 00000000..cbb018c5 --- /dev/null +++ b/compute/tensor/src/cpu/non_zero.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline int non_zero_kernel(TensorDesc inputDesc, T *input, TensorDesc outputDesc, int *output) +{ + int count = 0; + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + if (input[i] != 0) { + count++; + } + } + int length = count; + count = 0; + for (U32 i = 0; i < tensorNumElements(inputDesc); i++) { + if (input[i] != 0) { + std::vector id = calculateLocalIndex(i, inputDesc.dims, inputDesc.nDims); + for (U32 j = 0; j < inputDesc.nDims; j++) { + output[j * length + count] = id[j]; + } + count++; + } + } + return length; +} + +EE non_zero_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *length) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + *length = non_zero_kernel(inputDesc, (F32 *)input, outputDesc, (I32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + *length = non_zero_kernel(inputDesc, (F16 *)input, outputDesc, (I32 *)output); + break; +#endif + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/onehot.cpp b/compute/tensor/src/cpu/onehot.cpp new file mode 100644 index 00000000..827f0a57 --- /dev/null +++ b/compute/tensor/src/cpu/onehot.cpp @@ -0,0 +1,65 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE onehot_kernel( + TensorDesc inputDesc, IT *input, OneHotParamSpec p, TensorDesc outputDesc, OT *output) +{ + UNI_INIT(tensorNumElements(outputDesc), outputDesc.dt, p.values[0], output); + int axis = (p.axis + outputDesc.nDims) % outputDesc.nDims; + axis = outputDesc.nDims - 1 - axis; + int loopInner = 1, loopOuter = 1; + for (int i = 0; i < axis; i++) { + loopInner *= outputDesc.dims[i]; + } + for (U32 i = axis + 1; i < outputDesc.nDims; i++) { + loopOuter *= outputDesc.dims[i]; + } + for (int i = 0, k = 0; i < loopOuter; i++) { + for (int j = 0; j < loopInner; j++, k++) { + int index = input[k] >= 0 ? 
input[k] : input[k] + p.depth; + int id = (i * p.depth + index) * loopInner + j; + output[id] = p.values[1]; + } + } + return SUCCESS; +} + +EE onehot_cpu( + TensorDesc inputDesc, void *input, OneHotParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + if (inputDesc.dt != DT_I32) { + return NOT_SUPPORTED; + } + EE ret = NOT_SUPPORTED; + switch (outputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = onehot_kernel(inputDesc, (I32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = onehot_kernel(inputDesc, (I32 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/padding.cpp b/compute/tensor/src/cpu/padding.cpp index f9b87dd8..ded5eccd 100644 --- a/compute/tensor/src/cpu/padding.cpp +++ b/compute/tensor/src/cpu/padding.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "cpu/tensor_computing_cpu.h" -#include EE padding_infer_output_size_cpu( TensorDesc inputDesc, PadParamSpec padParamSpec, TensorDesc *outputDesc) @@ -66,6 +65,14 @@ EE padding_cpu(TensorDesc inputDesc, U32 alignSize = 1; if (idf == DF_NCHWC8) { alignSize = 8; + if (padParamSpec.front % 8 != 0 || padParamSpec.back % 8 != 0) { + UNI_ERROR_LOG("try to pad in channel dimension, input layout is nchwc8, but " + "padding(%d,%d) mod 8 != 0\n", + padParamSpec.front, padParamSpec.back); + } else { + padParamSpec.front /= 8; + padParamSpec.back /= 8; + } } ic /= alignSize; oc /= alignSize; @@ -81,7 +88,7 @@ EE padding_cpu(TensorDesc inputDesc, #ifdef _USE_FP16 case DT_F16: { F16 tmpV = padParamSpec.constant_value; - memcpy(&constant, &tmpV, bytesOf(odt)); + UNI_MEMCPY(&constant, &tmpV, bytesOf(odt)); break; } #endif @@ -90,12 +97,6 @@ EE padding_cpu(TensorDesc inputDesc, break; } - if (padParamSpec.front + padParamSpec.back != 0) { - if (padParamSpec.pad_mode != Pad_Constant || idf == DF_NCHWC8) { - UNI_ERROR_LOG("NOT SUPPORT this C channel padding\n"); - } - } - for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < ih; h++) { @@ -104,45 +105,35 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + (padParamSpec.front + c)) * oh + (padParamSpec.top + h)) * ow) * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr, 0, padParamSpec.left * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < padParamSpec.left * alignSize; ++i) { - memcpy(outPtr + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT( + padParamSpec.left * alignSize, odt, padParamSpec.constant_value, outPtr); outPtr += padParamSpec.left * alignSize * bytesOf(odt); - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); - if (constant == 0) { - memset(outPtr, 0, padParamSpec.right * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < padParamSpec.right * alignSize; ++i) { - memcpy(outPtr + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } + UNI_INIT( + padParamSpec.right * alignSize, odt, padParamSpec.constant_value, outPtr); } else { for (U32 w = 0; w < padParamSpec.left; w++) { U32 index = 0; - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (padParamSpec.left - w) 
* alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (padParamSpec.left - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr, iw * alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr, iw * alignSize * bytesOf(idt)); outPtr += iw * alignSize * bytesOf(odt); for (U32 w = 0; w < padParamSpec.right; w++) { U32 index = (iw - 1) * alignSize * bytesOf(idt); - if (padParamSpec.pad_mode == Pad_Reflect) { + if (padParamSpec.pad_mode == PAD_REFLECT) { index = (iw - w - 2) * alignSize * bytesOf(idt); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { index = (iw - w - 1) * alignSize * bytesOf(idt); } - memcpy(outPtr, inPtr + index, alignSize * bytesOf(idt)); + UNI_MEMCPY(outPtr, inPtr + index, alignSize * bytesOf(idt)); outPtr += alignSize * bytesOf(idt); } } @@ -150,26 +141,20 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc + c) * oh) * ow) * alignSize * bytesOf(odt); for (U32 h = 0; h < padParamSpec.top; h++) { U32 index = h * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.top * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + padParamSpec.top - h - 1) * ow * alignSize * bytesOf(odt)), @@ -180,24 +165,18 @@ EE padding_cpu(TensorDesc inputDesc, } for (U32 h = 0; h < padParamSpec.bottom; h++) { U32 index = (padParamSpec.top + ih + h) * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 2 - h) * ow * alignSize * bytesOf(odt)), ow * 
alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.top + ih - 1 - h) * ow * alignSize * bytesOf(odt)), ow * alignSize * bytesOf(odt)); } else { @@ -209,26 +188,20 @@ EE padding_cpu(TensorDesc inputDesc, U8 *outPtr = (U8 *)output + (((n * oc) * oh) * ow) * alignSize * bytesOf(odt); for (U32 c = 0; c < padParamSpec.front; c++) { U32 index = c * oh * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, oh * ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < oh * ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(oh * ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + (padParamSpec.front * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + padParamSpec.front - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + padParamSpec.front - c - 1) * oh * ow * alignSize * bytesOf(odt)), @@ -240,24 +213,18 @@ EE padding_cpu(TensorDesc inputDesc, for (U32 c = 0; c < padParamSpec.back; c++) { U32 index = (padParamSpec.front + ic + c) * oh * ow * alignSize * bytesOf(odt); - if (padParamSpec.pad_mode == Pad_Constant) { - if (constant == 0) { - memset(outPtr + index, 0, oh * ow * alignSize * bytesOf(odt)); - } else { - for (U32 i = 0; i < oh * ow * alignSize; ++i) { - memcpy(outPtr + index + i * bytesOf(odt), &constant, bytesOf(odt)); - } - } - } else if (padParamSpec.pad_mode == Pad_Edge) { - memcpy(outPtr + index, + if (padParamSpec.pad_mode == PAD_CONSTANT) { + UNI_INIT(oh * ow * alignSize, odt, padParamSpec.constant_value, outPtr + index); + } else if (padParamSpec.pad_mode == PAD_EDGE) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 1) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Reflect) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_REFLECT) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 2 - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); - } else if (padParamSpec.pad_mode == Pad_Symmetric) { - memcpy(outPtr + index, + } else if (padParamSpec.pad_mode == PAD_SYMMETRIC) { + UNI_MEMCPY(outPtr + index, outPtr + ((padParamSpec.front + ic - 1 - c) * oh * ow * alignSize * bytesOf(odt)), oh * ow * alignSize * bytesOf(odt)); } else { diff --git a/compute/tensor/src/cpu/power.cpp b/compute/tensor/src/cpu/power.cpp index cf08407e..a467ead2 100644 --- a/compute/tensor/src/cpu/power.cpp +++ b/compute/tensor/src/cpu/power.cpp @@ -13,18 +13,32 @@ #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" +#include "affinity_policy.h" +#include "uni.h" EE power_cpu( TensorDesc 
inputDesc, void *input, PowerParamSpec p, TensorDesc outputDesc, void *output, Arch arch) { UNUSED(outputDesc); + if (nullptr == input || nullptr == output) { + return NULL_POINTER; + } ArrayScaleFunction scale_func = get_array_scale_function(arch); ArrayPowerFunction power_func = get_array_power_function(arch); - if (nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); + int size = tensorNumElements(inputDesc); +#ifdef _USE_OPENMP + int tile = UNI_MAX(64, (((size + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8)); +#pragma omp parallel for num_threads(OMP_NUM_THREADS) + for (int i = 0; i < size; i += tile) +#else + int i = 0; + int tile = size; +#endif + { + int j = i * bytesOf(inputDesc.dt); + int num = UNI_MIN(size - i, tile); + scale_func(inputDesc.dt, ((U8 *)input) + j, ((U8 *)output) + j, num, p.scale, p.shift); + power_func(outputDesc.dt, ((U8 *)output) + j, ((U8 *)output) + j, num, p.power); } - - scale_func(inputDesc.dt, input, output, tensorNumElements(inputDesc), p.scale, p.shift); - power_func(outputDesc.dt, output, output, tensorNumElements(inputDesc), p.power); return SUCCESS; } diff --git a/compute/tensor/src/cpu/quantize.cpp b/compute/tensor/src/cpu/quantize.cpp index e37c7b3b..b46ea6ba 100644 --- a/compute/tensor/src/cpu/quantize.cpp +++ b/compute/tensor/src/cpu/quantize.cpp @@ -14,23 +14,34 @@ #include #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" -#if defined(_USE_INT8) && defined(__aarch64__) -#include "cpu/arm/int8/v8/convolution_gemm.h" +#if defined(_USE_NEON) && defined(_USE_FP16) && defined(_USE_INT8) +#include "cpu/arm/int8/v8.2/convolution_gemm.h" #endif #ifdef _USE_X86 #include "cpu/x86/tensor_computing_x86.h" #endif +typedef EE (*scaleFunc)( + DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp); + template inline static void apply_scale_round_template( const T *input, INT8 *output, U32 length, F32 scale, bool clamp) { for (U32 i = 0; i < length; i++) { - //output[i] = round_towards_zero(input[i] * scale, clamp); output[i] = round(input[i] * scale); } } +template +inline static void apply_scale_truncate_template( + const T *input, INT8 *output, U32 length, F32 scale, bool clamp) +{ + for (U32 i = 0; i < length; i++) { + output[i] = round_towards_zero(input[i] * scale, clamp); + } +} + inline EE apply_scale_round( DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp) { @@ -56,6 +67,31 @@ inline EE apply_scale_round( return ret; } +inline EE apply_scale_truncate( + DataType dt, const void *input, INT8 *output, U32 length, F32 scale, bool clamp) +{ + EE ret = SUCCESS; + switch (dt) { +#ifdef _USE_FP32 + case DT_F32: + apply_scale_truncate_template((const F32 *)input, output, length, scale, clamp); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + apply_scale_truncate_template((const F16 *)input, output, length, scale, clamp); + break; +#endif + case DT_I32: + apply_scale_truncate_template((const I32 *)input, output, length, scale, clamp); + break; + default: + ret = NOT_SUPPORTED; + break; + } + return ret; +} + EE quantize_hwncn8c4_cpu( TensorDesc dDesc, const void *data, TensorDesc *qDesc, void *qData, F32 *scale, Arch arch) { @@ -131,16 +167,14 @@ EE quantize_cpu( F32 min = minmax[0]; F32 max = minmax[1]; EE ret = SUCCESS; - ; + scaleFunc arrayScale = apply_scale_round; + if (max == 0 && min == 0) { *scale = 1; - memset(qData, 0, tensorNumBytes(*qDesc)); + UNI_MEMSET(qData, 0, tensorNumBytes(*qDesc)); } else { F32 absMax = UNI_MAX(UNI_ABS(max), UNI_ABS(min)); 
F32 scaleRaw = 127.0 / absMax; - if (*scale > 0 && dt != DT_I32) { - scaleRaw = *scale; - } bool clamp = false; INT8 *qArray = (INT8 *)qData; @@ -152,9 +186,9 @@ EE quantize_cpu( } const I32 *array = (const I32 *)data; I32 factor = 127 * 16777216 / (int)absMax; - // *scale *= scaleRaw; + U32 main = 0; -#if defined(_USE_INT8) && defined(__aarch64__) +#if defined(_USE_NEON) && defined(_USE_FP16) && defined(_USE_INT8) if (arch == ARM_A76 || arch == ARM_A55) { main = numData / 16; ret = quantize_I32(main * 4, (I32 *)data, factor, scaleRaw, qArray); @@ -167,7 +201,7 @@ EE quantize_cpu( if (*scale < scaleRaw) { *scale = scaleRaw; } - ret = apply_scale_round(dt, data, qArray, numData, *scale, (*scale) != scaleRaw); + ret = arrayScale(dt, data, qArray, numData, *scale, (*scale) != scaleRaw); } } UNI_DEBUG_LOG("tensor min value is %f, max value is %f, scale value is %f.\n", min, max, *scale); diff --git a/compute/tensor/src/cpu/reduction.cpp b/compute/tensor/src/cpu/reduction.cpp index e402adb4..e9405050 100644 --- a/compute/tensor/src/cpu/reduction.cpp +++ b/compute/tensor/src/cpu/reduction.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" #include "cpu/cpu_functions.h" @@ -29,16 +28,6 @@ static EE reduction_kernel(TensorDesc inputDesc, if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); } - - ArraySumFunction sum_func = get_array_sum_function(arch); - ArrayMeanFunction mean_func = get_array_mean_function(arch); - ArrayVarFunction var_func = get_array_var_function(arch); - ArrayAddFunction add_func = get_array_add_function(arch); - ArrayMulAndAddFunction mul_and_add_func = get_array_mul_and_add_function(arch); - ArrayScaleFunction scale_func = get_array_scale_function(arch); - ArrayMinMaxValueFunction minmax_value_func = get_array_minmax_value_function(arch); - ArrayMaxFunction max_func = get_array_max_function(arch); - if (axis < 0) { axis = inputDesc.nDims + axis; } @@ -55,74 +44,94 @@ static EE reduction_kernel(TensorDesc inputDesc, U32 maskLen = tensorNumElements(maskDesc); maskLen = (maskLen > 0) ? 
maskLen : len; U32 axisDim = maskLen / len; - for (U32 i = 0; i < loopOuter; i++) { - if (loopInner == 1) { - if (mask != nullptr) { - return NOT_SUPPORTED; - } - const T *array = input + i * len; - F32 tmpValue = 0; - switch (reductionMode) { - case REDUCTION_SUM: - output[i] = sum_func(inputDesc.dt, array, len); - break; - case REDUCTION_MEAN: - output[i] = mean_func(inputDesc.dt, array, len); - break; - case REDUCTION_STD_DEVIATION: { - tmpValue = mean_func(inputDesc.dt, array, len); - tmpValue = var_func(inputDesc.dt, array, len, tmpValue); - output[i] = sqrt(tmpValue); - break; - } - case REDUCTION_SCALAR_PRODUCT: - output[i] = var_func(inputDesc.dt, array, len, 0); - break; - case REDUCTION_MAX: { - F32 maxValue = 0; - CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 2, &maxValue)); - output[i] = maxValue; - break; - } - case REDUCTION_L2: { - tmpValue = var_func(inputDesc.dt, array, len, 0) * len; - output[i] = sqrt(tmpValue); - break; + EE ret = SUCCESS; +#ifdef _USE_OPENMP +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + ArraySumFunction sum_func = get_array_sum_function(arch); + ArrayMeanFunction mean_func = get_array_mean_function(arch); + ArrayVarFunction var_func = get_array_var_function(arch); + ArrayAddFunction add_func = get_array_add_function(arch); + ArrayMulAndAddFunction mul_and_add_func = get_array_mul_and_add_function(arch); + ArrayScaleFunction scale_func = get_array_scale_function(arch); + ArrayMinMaxValueFunction minmax_value_func = get_array_minmax_value_function(arch); + ArrayMaxFunction max_func = get_array_max_function(arch); +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < loopOuter; i++) { + if (loopInner == 1) { + const T *array = input + i * len; + F32 tmpValue = 0; + switch (reductionMode) { + case REDUCTION_SUM: + output[i] = sum_func(inputDesc.dt, array, len); + break; + case REDUCTION_MEAN: + output[i] = mean_func(inputDesc.dt, array, len); + break; + case REDUCTION_STD_DEVIATION: { + tmpValue = mean_func(inputDesc.dt, array, len); + tmpValue = var_func(inputDesc.dt, array, len, tmpValue); + output[i] = sqrt(tmpValue); + break; + } + case REDUCTION_SCALAR_PRODUCT: + output[i] = var_func(inputDesc.dt, array, len, 0); + break; + case REDUCTION_MAX: { + F32 maxValue = 0; + CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 2, &maxValue)); + output[i] = maxValue; + break; + } + case REDUCTION_L2: { + tmpValue = var_func(inputDesc.dt, array, len, 0) * len; + output[i] = sqrt(tmpValue); + break; + } + case REDUCTION_MIN: { + F32 minValue = 0; + CHECK_STATUS(minmax_value_func(inputDesc.dt, array, len, 1, &minValue)); + output[i] = minValue; + break; + } + default: + ret = NOT_SUPPORTED; + break; } - default: - return NOT_SUPPORTED; - } - } else { - CHECK_REQUIREMENT(REDUCTION_STD_DEVIATION != reductionMode); - for (U32 j = 0; j < maskLen; j += len) { - U32 axisIndex = j / len; - U32 outputIndex = (i * axisDim + axisIndex) * loopInner; - auto ptr2 = output + outputIndex; - for (U32 k = 0; k < len; k++) { - if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) { - auto ptr1 = &input[(i * len + k) * loopInner]; - if ((k == 0) && (reductionMode != REDUCTION_SCALAR_PRODUCT)) { - memcpy(ptr2, ptr1, loopInner * bytesOf(inputDesc.dt)); - continue; - } - if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { - add_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); - } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) { - mul_and_add_func(inputDesc.dt, ptr1, ptr1, ptr2, ptr2, loopInner); - } else 
if (reductionMode == REDUCTION_MAX) { - max_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); - } else { - return NOT_SUPPORTED; + } else { + for (U32 j = 0; j < maskLen; j += len) { + U32 axisIndex = j / len; + U32 outputIndex = (i * axisDim + axisIndex) * loopInner; + auto ptr2 = output + outputIndex; + for (U32 k = 0; k < len; k++) { + if (mask == nullptr || (mask != nullptr && mask[j + k] == 1)) { + auto ptr1 = &input[(i * len + k) * loopInner]; + if ((k == 0) && (reductionMode != REDUCTION_SCALAR_PRODUCT)) { + UNI_MEMCPY(ptr2, ptr1, loopInner * bytesOf(inputDesc.dt)); + continue; + } + if (reductionMode == REDUCTION_SUM || reductionMode == REDUCTION_MEAN) { + add_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); + } else if (reductionMode == REDUCTION_SCALAR_PRODUCT) { + mul_and_add_func(inputDesc.dt, ptr1, ptr1, ptr2, ptr2, loopInner); + } else if (reductionMode == REDUCTION_MAX) { + max_func(inputDesc.dt, ptr2, ptr1, ptr2, loopInner); + } else { + ret = NOT_SUPPORTED; + } } } - } - if (reductionMode == REDUCTION_MEAN) { - scale_func(inputDesc.dt, ptr2, ptr2, loopInner, 1.0 / len, 0); + if (reductionMode == REDUCTION_MEAN) { + scale_func(inputDesc.dt, ptr2, ptr2, loopInner, 1.0 / len, 0); + } } } } } - return SUCCESS; + return ret; } EE reduction_cpu(TensorDesc inputDesc, @@ -143,7 +152,7 @@ EE reduction_cpu(TensorDesc inputDesc, int channel = tmpDesc.nDims - 1; if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { U32 cx = (inputDesc.df == DF_NCHWC8) ? 8 : 16; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { // channel dimension if (p.axes[i] == 1 || p.axes[i] == -channel) { start = -1; @@ -159,8 +168,8 @@ EE reduction_cpu(TensorDesc inputDesc, } const void *tmp1 = input; void *tmp2 = nullptr; - for (int i = start; i < p.axes_num; i++) { - if (p.axes_num - start == 1) { + for (int i = start; i < p.num_axes; i++) { + if (p.num_axes - start == 1) { tmp2 = output; } else { tmp2 = (char *)tmp + (i - start) % 2 * (tmpBytes / 2); @@ -176,17 +185,27 @@ EE reduction_cpu(TensorDesc inputDesc, #ifdef _USE_FP32 case DT_F32: { ret = reduction_kernel(tmpDesc, (const F32 *)tmp1, maskDesc, - (const float *)mask, axis, p.reduction_mode, outputDesc, (F32 *)tmp2, arch); + (const float *)mask, axis, p.mode, outputDesc, (F32 *)tmp2, arch); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = reduction_kernel(tmpDesc, (const F16 *)tmp1, maskDesc, - (const float *)mask, axis, p.reduction_mode, outputDesc, (F16 *)tmp2, arch); + (const float *)mask, axis, p.mode, outputDesc, (F16 *)tmp2, arch); break; } #endif + case DT_I32: { + ret = reduction_kernel(tmpDesc, (const I32 *)tmp1, maskDesc, + (const float *)mask, axis, p.mode, outputDesc, (I32 *)tmp2, arch); + break; + } + case DT_U32: { + ret = reduction_kernel(tmpDesc, (const U32 *)tmp1, maskDesc, + (const float *)mask, axis, p.mode, outputDesc, (U32 *)tmp2, arch); + break; + } default: ret = NOT_SUPPORTED; break; @@ -200,7 +219,7 @@ EE reduction_cpu(TensorDesc inputDesc, } if (tmp2 != output) { - memcpy(output, tmp2, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, tmp2, tensorNumBytes(outputDesc)); } if (p.coeff != 1) { diff --git a/compute/tensor/src/cpu/reshape.cpp b/compute/tensor/src/cpu/reshape.cpp index ccf8a1f0..dea3054f 100644 --- a/compute/tensor/src/cpu/reshape.cpp +++ b/compute/tensor/src/cpu/reshape.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, TensorDesc *outputDesc) @@ -19,8 +18,8 @@ EE reshape_infer_output_size_cpu(TensorDesc inputDesc, ReshapeParamSpec p, Tenso if (nullptr == outputDesc) { return NULL_POINTER; } - I32 *shape = p.shape_dims; - I32 shape_size = p.shape_size; + I32 *shape = p.shape; + I32 shape_size = p.num_shape; int inputElementNum = tensorNumElements(inputDesc); int outputElementNum = 1; for (int i = 0; i < shape_size; i++) { @@ -107,7 +106,7 @@ EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o if ((DF_NCHWC8 != inputDesc.df && DF_NCHWC16 != inputDesc.df) || sameDim) { if (output != input) { - memcpy(output, input, tensorNumBytes(outputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(outputDesc)); } } else { CHECK_REQUIREMENT(input != output); @@ -132,7 +131,7 @@ EE reshape_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o for (U32 c = 0; c < ic; c++) { for (U32 hw = 0; hw < ih * iw; hw++) { for (U32 c8 = 0; c8 < cx; c8++) { - memcpy(outPtr + + UNI_MEMCPY(outPtr + elementBytes * (n * ic * cx * ih * iw + (c * cx + c8) * ih * iw + hw), inPtr + elementBytes * diff --git a/compute/tensor/src/cpu/rnn.cpp b/compute/tensor/src/cpu/rnn.cpp index 9d7441cc..a93d69ba 100644 --- a/compute/tensor/src/cpu/rnn.cpp +++ b/compute/tensor/src/cpu/rnn.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/tensor_computing_cpu.h" #ifdef _USE_GENERAL #include "cpu/general/tensor_computing_general.h" @@ -23,7 +22,6 @@ #include "cpu/arm/tensor_computing_arm.h" #endif #include "blas_enhance.h" -#include "tensor_transpose.h" template void transformNK2NKN32(const T *src, U32 stride, T *dst, U32 N, U32 K) @@ -43,6 +41,7 @@ static EE rnn_transform_filter(TensorDesc filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, T *ftmArray, + float *scale, DataFormat ftmDataFormat, Arch arch) { @@ -68,12 +67,13 @@ static EE rnn_transform_filter(TensorDesc filterDesc, return NOT_MATCH; } U32 hDim; - if (rnnParamSpec.numProjection > 0) { - hDim = rnnParamSpec.numProjection; + if (rnnParamSpec.num_projection > 0) { + hDim = rnnParamSpec.num_projection; } else { - hDim = rnnParamSpec.numOutput; + hDim = rnnParamSpec.num_outputs; } - U32 xDim = fk - rnnParamSpec.numOutput; + U32 xDim = fk - rnnParamSpec.num_outputs; + *ftmDesc = tensor2df(fdt, ftmDataFormat, fn, fk); switch (ftmDataFormat) { case DF_NKN32: { // NK => NKN32 @@ -90,22 +90,41 @@ static EE rnn_transform_filter(TensorDesc filterDesc, } case DF_NKNx_NKN32: { // NK => NKNx_NKN32 - T *filterTmp = ftmArray + fn * fk; + std::vector filterTmp(fn * UNI_MAX(xDim, hDim)); for (U32 n = 0; n < fn; ++n) { - memcpy(filterTmp + n * xDim, filterArray + n * fk, xDim * sizeof(T)); + UNI_MEMCPY(filterTmp.data() + n * xDim, filterArray + n * fk, xDim * sizeof(T)); } TensorDesc mmmDesc = tensor2df(fdt, DF_TRANSPOSE, fn, xDim); - matrix_matrix_multiply_transform_rhs(mmmDesc, filterTmp, &mmmDesc, ftmArray, arch); + matrix_matrix_multiply_transform_rhs( + mmmDesc, filterTmp.data(), &mmmDesc, ftmArray, arch); - transformNK2NKN32( - filterArray + xDim, fk, ftmArray + fn * xDim, fn, rnnParamSpec.numOutput); + if (0) { +#if defined(_USE_INT8) && 
defined(_USE_ULTRA_OPTIMIZATION) + } else if (arch == X86_AVX512 && rnnParamSpec.mode == RNN_LSTM && + rnnParamSpec.num_projection == 0) { + for (U32 n = 0; n < fn; ++n) { + UNI_MEMCPY( + filterTmp.data() + n * hDim, filterArray + n * fk + xDim, hDim * sizeof(T)); + } + TensorDesc mvmDesc = tensor2df(fdt, DF_NORMAL, fn, hDim); + TensorDesc mvmQuantDesc = tensor2df(DT_I8, DF_NORMAL, fn, hDim); + TensorDesc mvmTransDesc; + std::vector filterQuant(fn * hDim); + CHECK_STATUS(quantize_cpu( + mvmDesc, filterTmp.data(), &mvmQuantDesc, filterQuant.data(), scale, arch)); + CHECK_STATUS(matrix_vector_multiply_transform_weight( + mvmQuantDesc, filterQuant.data(), &mvmTransDesc, ftmArray + fn * xDim, arch)); +#endif + } else { + transformNK2NKN32( + filterArray + xDim, fk, ftmArray + fn * xDim, fn, rnnParamSpec.num_outputs); + } break; } default: ret = NOT_MATCH; break; } - *ftmDesc = tensor2df(fdt, ftmDataFormat, fn, fk); return ret; } @@ -114,7 +133,9 @@ static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void *ftmArray, + float *scale, DataFormat ftmDataFormat, + Arch arch) { EE ret = SUCCESS; @@ -122,14 +143,14 @@ static EE rnn_transform_filter_cpu_kernel(TensorDesc filterDesc, #ifdef _USE_FP32 case DT_F32: { ret = rnn_transform_filter(filterDesc, (const F32 *)filterArray, rnnParamSpec, - ftmDesc, (F32 *)ftmArray, ftmDataFormat, arch); + ftmDesc, (F32 *)ftmArray, scale, ftmDataFormat, arch); break; } #endif #ifdef _USE_FP16 case DT_F16: { ret = rnn_transform_filter(filterDesc, (const F16 *)filterArray, rnnParamSpec, - ftmDesc, (F16 *)ftmArray, ftmDataFormat, arch); + ftmDesc, (F16 *)ftmArray, scale, ftmDataFormat, arch); break; } #endif @@ -145,10 +166,11 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDesc, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void **ftmArray, + float *scale, Arch arch) { - int num1 = rnnParamSpec.biDirection ? 2 : 1; - int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; + int num2 = rnnParamSpec.num_projection > 0 ? 2 : 1; EE ret = SUCCESS; DataFormat ftmDataFormat; for (int i = 0; i < num1 * num2; i++) { @@ -158,7 +180,7 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDesc, ftmDataFormat = DF_NKN32; } CHECK_STATUS(rnn_transform_filter_cpu_kernel(filterDesc[i], filterArray[i], rnnParamSpec, - &ftmDesc[i], ftmArray[i], ftmDataFormat, arch)); + &ftmDesc[i], ftmArray[i], scale + i, ftmDataFormat, arch)); } return ret; } @@ -169,12 +191,13 @@ EE rnn_transform_filter_bytes_cpu( if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); } - int num1 = rnnParamSpec.biDirection ? 2 : 1; - int num2 = rnnParamSpec.numProjection > 0 ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; + int num2 = rnnParamSpec.num_projection > 0 ? 2 : 1; for (int i = 0; i < num1 * num2; i++) { bytes[i] = tensorNumBytes(filterDesc[i]); - if (((i % 2 == 0) || (num2 == 1)) && (rnnParamSpec.steps >= 0)) { // RNN filter - bytes[i] += tensorNumBytes(filterDesc[i]); + // x86 need to add offset for U8 type, bytes = bias_length(fn) * size(int) + if (rnnParamSpec.mode == RNN_LSTM) { + bytes[i] += filterDesc[i].dims[1] * sizeof(I32); } } return SUCCESS; @@ -195,9 +218,9 @@ EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, DataFormat idf; U32 batch, xDim; CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = rnnParamSpec.numOutput; - U32 column = (rnnParamSpec.numProjection > 0) ? 
rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; + U32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; EE ret = SUCCESS; U32 factor = 0; switch (rnnParamSpec.mode) { @@ -216,6 +239,8 @@ EE rnncell_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, } *bytes = (hDim + xDim + column * factor) * bytesOf(idt); + // for input quantization + *bytes += (hDim + xDim) * bytesOf(DT_I8); return ret; } @@ -239,12 +264,12 @@ EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; TensorDesc xDesc = tensor2df(idt, DF_NORMAL, batch, xDim); CHECK_STATUS(rnncell_infer_forward_tmp_bytes_cpu( xDesc, filterDesc, outputDesc, rnnParamSpec, bytes, arch)); - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; EE ret = SUCCESS; U32 factor = 0; switch (rnnParamSpec.mode) { @@ -262,16 +287,20 @@ EE rnn_infer_forward_tmp_bytes_cpu(TensorDesc inputDesc, break; } - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; *bytes += batch * ((column + hDim) * num1 + column * factor) * bytesOf(idt); if (idf == DF_NCHWC8) { *bytes += tensorNumBytes(inputDesc); } - if (rnnParamSpec.steps >= 0) { //RNN - *bytes += batch * step * column * factor * bytesOf(idt); // Intermediate gate result - *bytes += UNI_MAX(batch * step * xDim, xDim * column) * bytesOf(idt); // mmm tmp buffer + if (rnnParamSpec.steps >= 0) { + // Intermediate gate result + *bytes += batch * step * column * factor * bytesOf(idt); + // mmm tmp buffer + *bytes += UNI_MAX(batch * step * xDim, xDim * column) * bytesOf(idt); *bytes += 32; } + // for input quantization + *bytes += (hDim + xDim) * bytesOf(DT_I8); return ret; } @@ -281,6 +310,7 @@ EE rnncell_cpu(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, RNNParamSpec rnnParamSpec, U32 batchStrideX, @@ -299,8 +329,8 @@ EE rnncell_cpu(TensorDesc xDesc, #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, tmp, - rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); + ret = rnncell_x86(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, currentH, arch); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -317,6 +347,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, RNNParamSpec rnnParamSpec, U32 tmpBytes, void *tmp, @@ -334,7 +365,7 @@ EE rnn_cpu(TensorDesc inputDesc, DataType fdt; DataFormat fdf; U32 fk, fn; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); if (fdf != DF_NKNx_NKN32) { CHECK_STATUS(NOT_MATCH); @@ -357,9 +388,9 @@ EE rnn_cpu(TensorDesc inputDesc, tmp = (U8 *)tmp + tensorNumBytes(tmpDesc); } - U32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; U8 bytesOfIdt = bytesOf(idt); U32 batchStrideX = step * xDim; U32 batchStrideH = step * hDim * num1; @@ -377,7 +408,7 @@ EE rnn_cpu(TensorDesc inputDesc, U32 tileSize = fn * bytesOfIdt; for (U32 m = 0; m < batch; m++) { for (U32 t = 0; t < step; ++t) { - memcpy(InterGate + (m * step + t) * tileSize, bias[0], tileSize); + UNI_MEMCPY(InterGate + (m * step + t) * tileSize, bias[0], tileSize); } } @@ -394,7 +425,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void *useFilter[2] = {(const void *)(mmmFilter + fn * xDim * bytesOfIdt), nullptr}; const void *useBias[2] = {nullptr, nullptr}; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { useFilter[1] = filter[1]; } if (rnnParamSpec.mode == RNN_GRU_LBR) { @@ -405,23 +436,23 @@ EE rnn_cpu(TensorDesc inputDesc, U8 *currentH = (U8 *)output + t * hDim * num1 * bytesOfIdt; useBias[0] = (void *)(InterGate + t * fn * bytesOfIdt); CHECK_STATUS(rnncell_cpu(xDesc, nullptr, &useFilterDesc, useFilter, biasDesc, useBias, - cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, hDesc, - currentH, arch)); + scale, cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, + hDesc, currentH, arch)); } - if (rnnParamSpec.biDirection) { - int fCount = (rnnParamSpec.numProjection > 0) ? 2 : 1; + if (rnnParamSpec.bi_direction) { + int fCount = (rnnParamSpec.num_projection > 0) ? 2 : 1; int bCount = (rnnParamSpec.mode == RNN_GRU_LBR) ? 2 : 1; mmmFilter = (const U8 *)filter[fCount]; for (U32 m = 0; m < batch; m++) { for (U32 t = 0; t < step; ++t) { - memcpy(InterGate + (m * step + t) * tileSize, bias[bCount], tileSize); + UNI_MEMCPY(InterGate + (m * step + t) * tileSize, bias[bCount], tileSize); } } CHECK_STATUS(matrix_matrix_multiply(inDesc, inputTmp, mmmFilterDesc, mmmFilter, step * xDim * bytesOfIdt, tmpArray, outDesc, InterGate, nullptr, arch)); useFilter[0] = mmmFilter + fn * xDim * bytesOfIdt; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { useFilter[1] = filter[fCount + 1]; } if (rnnParamSpec.mode == RNN_GRU_LBR) { @@ -432,8 +463,8 @@ EE rnn_cpu(TensorDesc inputDesc, U8 *currentH = (U8 *)output + (t * hDim * num1 + hDim) * bytesOfIdt; useBias[0] = (void *)(InterGate + t * fn * bytesOfIdt); CHECK_STATUS(rnncell_cpu(xDesc, nullptr, &useFilterDesc, useFilter, biasDesc, useBias, - cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, intermediateH, hDesc, - currentH, arch)); + scale + fCount, cellState, rnnParamSpec, batchStrideX, batchStrideH, tmpBytes, + intermediateH, hDesc, currentH, arch)); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/roialign.cpp b/compute/tensor/src/cpu/roialign.cpp index 25f5304a..e0435928 100644 --- a/compute/tensor/src/cpu/roialign.cpp +++ b/compute/tensor/src/cpu/roialign.cpp @@ -14,11 +14,12 @@ #include "cpu/tensor_computing_cpu.h" #include "tensor_transpose.h" -template -static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y) +static void preprocess(U32 w, U32 h, F32 x, F32 y, int c8Align, F32 *factor, U32 *offset) { if (y < -1.0 || y > h || x < -1.0 || x > w) { - return 0; + UNI_MEMSET(factor, 0, sizeof(float) * 4); + UNI_MEMSET(offset, 0, sizeof(U32) * 4); + return; } if (y <= 0) { y = 0; @@ -32,37 +33,36 @@ static F32 bilinear_interpolate(T *data, U32 w, U32 h, F32 x, F32 y) U32 y0 = y; U32 y1 = y0 + 1; - F32 hx = x1 - x; - F32 lx = x - x0; - F32 hy = y1 - y; - F32 ly = y - y0; - - if (x1 >= w) { - x1 = w - 1; - hx = 1.f; - lx = 0.f; + if (y0 
>= h - 1) { + y0 = y1 = h - 1; + y = y0; } - if (y1 >= h) { - y1 = h - 1; - hy = 1.f; - ly = 0.f; + if (x0 >= w - 1) { + x0 = x1 = w - 1; + x = x0; } - - F32 r0 = data[y0 * w + x0] * hx + data[y0 * w + x1] * lx; - F32 r1 = data[y1 * w + x0] * hx + data[y1 * w + x1] * lx; - - F32 val = r0 * hy + r1 * ly; - return val; + F32 lx = x - x0; + F32 ly = y - y0; + F32 hx = 1 - lx; + F32 hy = 1 - ly; + factor[0] = hy * hx; + factor[1] = hy * lx; + factor[2] = ly * hx; + factor[3] = ly * lx; + offset[0] = (y0 * w + x0) * c8Align; + offset[1] = (y0 * w + x1) * c8Align; + offset[2] = (y1 * w + x0) * c8Align; + offset[3] = (y1 * w + x1) * c8Align; } -template -static EE roialign_kernel(std::vector input, - T *output, - std::vector inputDesc, +template +static void roialign_kernel(std::vector inputDesc, + std::vector input, U32 output_h, U32 output_w, U32 sampling_ratio, - F32 spatial_scale) + F32 spatial_scale, + T *output) { DataType idt0, idt1; DataFormat idf0, idf1; @@ -72,27 +72,24 @@ static EE roialign_kernel(std::vector input, CHECK_STATUS(tensor2dGet(inputDesc[1], &idt1, &idf1, &ih1, &iw1)); T *feature_map = (T *)input[0]; T *rois = (T *)input[1]; - CHECK_REQUIREMENT(idf0 == DF_NCHWC8 || idf0 == DF_NCHW); - if (inputDesc[0].df == DF_NCHWC8) { - T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); - memcpy(tmp, feature_map, tensorNumBytes(inputDesc[0])); - CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], feature_map)); - free(tmp); + U32 c8Align = 1; + if (idf0 == DF_NCHWC8) { + c8Align = 8; } U32 channel = ic0; U32 feature_w = iw0; U32 feature_h = ih0; U32 num_rois = ih1; - for (U32 n = 0; n < num_rois; n++) { - U32 idx_n = n * channel * output_w * output_h; + F32 val; + for (U32 n = 0, idx = 0; n < num_rois; n++) { F32 roi_start_x1 = static_cast(rois[n * 4]) * spatial_scale; F32 roi_start_y1 = static_cast(rois[n * 4 + 1]) * spatial_scale; F32 roi_end_x2 = static_cast(rois[n * 4 + 2]) * spatial_scale; F32 roi_end_y2 = static_cast(rois[n * 4 + 3]) * spatial_scale; - F32 roi_w = std::max(roi_end_x2 - roi_start_x1, 1.f); - F32 roi_h = std::max(roi_end_y2 - roi_start_y1, 1.f); + F32 roi_w = UNI_MAX(roi_end_x2 - roi_start_x1, 1.f); + F32 roi_h = UNI_MAX(roi_end_y2 - roi_start_y1, 1.f); F32 bin_size_w = roi_w / static_cast(output_w); F32 bin_size_h = roi_h / static_cast(output_h); @@ -100,41 +97,98 @@ static EE roialign_kernel(std::vector input, U32 bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_w / output_w); U32 bin_grid_h = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_h / output_h); + std::vector factor(output_h * output_w * bin_grid_h * bin_grid_w * 4); + std::vector offset(output_h * output_w * bin_grid_h * bin_grid_w * 4); + for (U32 ph = 0, id = 0; ph < output_h; ph++) { + F32 start_y = roi_start_y1 + ph * bin_size_h; + for (U32 pw = 0; pw < output_w; pw++) { + F32 start_x = roi_start_x1 + pw * bin_size_w; + for (U32 by = 0; by < bin_grid_h; by++) { + F32 y = start_y + + static_cast(by + 0.5f) * bin_size_h / static_cast(bin_grid_h); + for (U32 bx = 0; bx < bin_grid_w; bx++, id += 4) { + F32 x = start_x + + static_cast(bx + 0.5f) * bin_size_w / static_cast(bin_grid_w); + preprocess(feature_w, feature_h, x, y, c8Align, factor.data() + id, + offset.data() + id); + } + } + } + } F32 count = bin_grid_h * bin_grid_w; - for (U32 c = 0; c < channel; c++) { - U32 idx_nc = idx_n + c * output_h * output_w; - T *feature_map_offset = feature_map + c * feature_h * feature_w; - for (U32 ph = 0; ph < output_h; ph++) { - for (U32 pw = 0; pw < output_w; pw++) { - U32 idx = idx_nc + ph * output_w + pw; - F32 output_val = 0; - F32 start_x = roi_start_x1 + pw * bin_size_w; - F32 start_y = roi_start_y1 + ph * bin_size_h; - for (U32 by = 0; by < bin_grid_h; by++) { - F32 y = start_y + - static_cast(by + 0.5f) * bin_size_h / static_cast(bin_grid_h); - for (U32 bx = 0; bx < bin_grid_w; bx++) { - F32 x = start_x + - static_cast(bx + 0.5f) * bin_size_w / - static_cast(bin_grid_w); - F32 val = bilinear_interpolate( - (T *)feature_map_offset, feature_w, feature_h, x, y); - output_val += val; + for (U32 c0 = 0, c = 0; c0 < channel / c8Align; c0++) { + for (U32 c1 = 0; c1 < c8Align; c1++, c++) { + T *data = feature_map + c0 * feature_h * feature_w * c8Align + c1; + for (U32 ph = 0, id00 = 0; ph < output_h; ph++) { + for (U32 pw = 0; pw < output_w; pw++, idx++) { + if (mode == POOLING_MEAN) { + val = 0; + } else { + val = -UNI_F16_MAX; + } + for (U32 by = 0; by < bin_grid_h; by++) { + for (U32 bx = 0; bx < bin_grid_w; bx++, id00 += 4) { + int id01 = id00 + 1; + int id10 = id00 + 2; + int id11 = id00 + 3; + if (mode == POOLING_MEAN) { + val += factor[id00] * data[offset[id00]] + + factor[id01] * data[offset[id01]] + + factor[id10] * data[offset[id10]] + + factor[id11] * data[offset[id11]]; + } else { + val = UNI_MAX( + UNI_MAX( + UNI_MAX(UNI_MAX(val, factor[id00] * data[offset[id00]]), + factor[id01] * data[offset[id01]]), + factor[id10] * data[offset[id10]]), + factor[id11] * data[offset[id11]]); + } + } + } + output[idx] = val; + if (mode == POOLING_MEAN) { + output[idx] /= count; } } - output_val /= count; - output[idx] = output_val; } } } } +} - return SUCCESS; +template +static EE roialign_kernel(std::vector inputDesc, + std::vector input, + PoolingMode mode, + U32 output_h, + U32 output_w, + U32 sampling_ratio, + F32 spatial_scale, + T *output) +{ + EE ret = SUCCESS; + switch (mode) { + case POOLING_MEAN: { + roialign_kernel( + inputDesc, input, output_h, output_w, sampling_ratio, spatial_scale, output); + break; + } + case POOLING_MAX: { + roialign_kernel( + inputDesc, input, output_h, output_w, sampling_ratio, spatial_scale, output); + break; + } + default: + ret = NOT_SUPPORTED; + break; + } + return ret; } EE roialign_cpu(std::vector inputDesc, std::vector input, - RoIAlignParamSpec roiAlignParamSpec, + RoIAlignParamSpec p, TensorDesc outputDesc, void *output) { @@ -142,26 +196,21 @@ EE roialign_cpu(std::vector inputDesc, if (nullptr == output) { CHECK_STATUS(NULL_POINTER); } - U32 output_h = roiAlignParamSpec.output_h; - U32 output_w = 
roiAlignParamSpec.output_w; - U32 sampling_ratio = roiAlignParamSpec.sampling_ratio; - F32 spatial_scale = roiAlignParamSpec.spatial_scale; - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (inputDesc[0].dt) { #ifdef _USE_FP32 case DT_F32: - ret = roialign_kernel( - input, (F32 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + ret = roialign_kernel(inputDesc, input, p.mode, p.output_h, p.output_w, + p.sampling_ratio, p.spatial_scale, (F32 *)output); break; #endif #ifdef _USE_FP16 case DT_F16: - ret = roialign_kernel( - input, (F16 *)output, inputDesc, output_h, output_w, sampling_ratio, spatial_scale); + ret = roialign_kernel(inputDesc, input, p.mode, p.output_h, p.output_w, + p.sampling_ratio, p.spatial_scale, (F16 *)output); break; #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/scale.cpp b/compute/tensor/src/cpu/scale.cpp new file mode 100644 index 00000000..3c4a0db9 --- /dev/null +++ b/compute/tensor/src/cpu/scale.cpp @@ -0,0 +1,43 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
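The rewritten ROI Align kernel above splits bilinear interpolation into a precomputation step: for every sampling point it stores four weights and four flattened pixel offsets, which both pooling modes then reuse for every channel. A minimal standalone sketch of that weight/offset computation, assuming a plain NCHW layout (c8Align == 1) and purely illustrative names:

#include <cstring>

// For a sampling point (x, y) in a w x h feature map, compute the four bilinear
// weights and the flattened offsets of the surrounding pixels. Points more than
// one pixel outside the map contribute nothing (all weights zero).
static void bilinear_weights(unsigned w, unsigned h, float x, float y,
    float factor[4], unsigned offset[4])
{
    if (y < -1.0f || y > h || x < -1.0f || x > w) {
        std::memset(factor, 0, 4 * sizeof(float));
        std::memset(offset, 0, 4 * sizeof(unsigned));
        return;
    }
    if (x < 0) { x = 0; }
    if (y < 0) { y = 0; }
    unsigned x0 = (unsigned)x, x1 = x0 + 1;
    unsigned y0 = (unsigned)y, y1 = y0 + 1;
    if (x0 >= w - 1) { x0 = x1 = w - 1; x = (float)x0; }  // clamp to the last column
    if (y0 >= h - 1) { y0 = y1 = h - 1; y = (float)y0; }  // clamp to the last row
    float lx = x - x0, ly = y - y0;                       // fractional parts
    float hx = 1 - lx, hy = 1 - ly;
    factor[0] = hy * hx;  offset[0] = y0 * w + x0;
    factor[1] = hy * lx;  offset[1] = y0 * w + x1;
    factor[2] = ly * hx;  offset[2] = y1 * w + x0;
    factor[3] = ly * lx;  offset[3] = y1 * w + x1;
}

// The sampled value is factor[0]*data[offset[0]] + ... + factor[3]*data[offset[3]];
// the mean branch accumulates this over the bin grid, while the max branch keeps
// the largest individual weighted term, as in the kernel above.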
+ +#include "cpu/tensor_computing_cpu.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif + +EE scale_cpu( + TensorDesc inputDesc, void *input, void *alpha, void *beta, ScaleParamSpec p, TensorDesc outputDesc, void *output, Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/cpu/scatter.cpp b/compute/tensor/src/cpu/scatter.cpp index bcdc659f..973b278f 100644 --- a/compute/tensor/src/cpu/scatter.cpp +++ b/compute/tensor/src/cpu/scatter.cpp @@ -27,10 +27,10 @@ inline static void scatter_elements(const TensorDesc &dataDesc, int axis = (p.axis + dataDesc.nDims) % dataDesc.nDims; axis = dataDesc.nDims - 1 - axis; - memcpy(output, data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(output, data, tensorNumBytes(dataDesc)); - for (U32 i = 0; i < tensorNumElements(dataDesc); i++) { - std::vector local = calculateLocalIndex(i, dataDesc.dims, dataDesc.nDims); + for (U32 i = 0; i < tensorNumElements(updateDesc); i++) { + std::vector local = calculateLocalIndex(i, updateDesc.dims, updateDesc.nDims); local[axis] = index[i]; U32 k = calculateGlobalIndex(local.data(), dataDesc.dims, dataDesc.nDims); output[k] = update[i]; @@ -47,7 +47,7 @@ inline static void scatterND(const TensorDesc &dataDesc, const TensorDesc &outputDesc, T *output) { - memcpy(output, data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(output, data, tensorNumBytes(dataDesc)); int lastDim = indexDesc.dims[0]; for (U32 i = 0; i < indexDesc.nDims - 1; i++) { diff --git a/compute/tensor/src/cpu/slice.cpp b/compute/tensor/src/cpu/slice.cpp index 72b59cef..81aeef13 100644 --- a/compute/tensor/src/cpu/slice.cpp +++ b/compute/tensor/src/cpu/slice.cpp @@ -11,17 +11,17 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
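The scatter.cpp change above makes scatter_elements iterate over the update tensor rather than the data tensor, which matters because the update can be smaller than the data: the operator first copies data into output and then performs exactly one overwrite per update element. A toy 1-D version of that contract (an illustrative helper, not part of the patch):

#include <vector>

// Minimal 1-D ScatterElements: output starts as a copy of data, then each update
// element is written to the position named by the matching index entry.
// Looping over the (possibly smaller) update tensor is what makes this correct.
template <typename T>
std::vector<T> scatter_elements_1d(
    const std::vector<T> &data, const std::vector<int> &index, const std::vector<T> &update)
{
    std::vector<T> output(data);                  // copy the data first
    for (size_t i = 0; i < update.size(); ++i) {  // one write per update element
        output[index[i]] = update[i];
    }
    return output;
}

// Example: data = {1,2,3,4,5}, index = {0,3}, update = {9,8}  ->  output = {9,2,3,8,5}.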
-#include +#include #include #include "cpu/tensor_computing_cpu.h" EE slice_cpu(TensorDesc inputDesc, void *input, SliceParamSpec p, - std::vector outputDesc, - std::vector *output) + std::vector &outputDesc, + std::vector &output) { - if (nullptr == input || nullptr == output) { + if (nullptr == input) { CHECK_STATUS(NULL_POINTER); } U32 num = outputDesc.size(); @@ -41,23 +41,66 @@ EE slice_cpu(TensorDesc inputDesc, loops *= inputDesc.dims[i]; } - if (inputDesc.df == DF_NCHWC8) { - if (axis < 2) { + bool sameFormat = true; + for (U32 j = 0; j < num; j++) { + if (inputDesc.df != outputDesc[j].df) { + sameFormat = false; + break; + } + } + + if (sameFormat && inputDesc.df == DF_NCHWC8) { + if (axis < dim - 2) { tileSize *= 8; loops /= 8; } } - U8 *ptr = (U8 *)input; - for (U32 i = 0; i < loops; i++) { + if (sameFormat) { + U8 *ptr = (U8 *)input; + for (U32 i = 0; i < loops; i++) { + for (U32 j = 0; j < num; j++) { + U32 blockSize = outputDesc[j].dims[axis] * tileSize; + if (blockSize > 0 && nullptr == output[j]) { + CHECK_STATUS(NULL_POINTER); + } + U8 *dstPtr = (U8 *)(output[j]) + i * blockSize; + UNI_MEMCPY(dstPtr, ptr, blockSize); + ptr += blockSize; + } + } + } else { + if (axis != dim - 2) { + return NOT_SUPPORTED; + } + U8 *iPtr = (U8 *)input; + U32 eleSize = bytesOf(inputDesc.dt); + tileSize /= eleSize; + U32 startDims = 0; + U32 endDims = 0; + std::set nativeFormat = {DF_NCHW, DF_MTK, DF_NORMAL}; + for (U32 j = 0; j < num; j++) { - U32 blockSize = outputDesc[j].dims[axis] * tileSize; - if (blockSize > 0 && nullptr == (*output)[j]) { - CHECK_STATUS(NULL_POINTER); + endDims += outputDesc[j].dims[axis]; + U8 *oPtr = (U8 *)output[j]; + if (inputDesc.df == DF_NCHWC8 && nativeFormat.count(outputDesc[j].df)) { + for (U32 i = 0; i < loops; i++) { + for (U32 d = startDims; d < endDims; ++d) { + U32 c8 = d % 8; + U32 c = d - c8; + for (U32 t = 0; t < tileSize; ++t) { + U32 oIdx = i * tileSize * (endDims - startDims) + + (d - startDims) * tileSize + t; + U32 iIdx = + i * tileSize * inputDesc.dims[axis] + c * tileSize + t * 8 + c8; + UNI_MEMCPY(oPtr + oIdx * eleSize, iPtr + iIdx * eleSize, eleSize); + } + } + } + } else { + return NOT_SUPPORTED; } - U8 *dstPtr = (U8 *)((*output)[j]) + i * blockSize; - memcpy(dstPtr, ptr, blockSize); - ptr += blockSize; + startDims = endDims; } } return SUCCESS; diff --git a/compute/tensor/src/cpu/space2depth.cpp b/compute/tensor/src/cpu/space2depth.cpp new file mode 100644 index 00000000..27be5afc --- /dev/null +++ b/compute/tensor/src/cpu/space2depth.cpp @@ -0,0 +1,88 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/tensor_computing_cpu.h" + +template +static inline EE space2depth_kernel( + TensorDesc inputDesc, T *input, Space2DepthParamSpec p, TensorDesc outputDesc, T *output) +{ + DataType idt, odt; + DataFormat idf, odf; + U32 in, ic, ih, iw; + U32 on, oc, oh, ow; + int bh = p.block_size; + int bw = p.block_size; + if (tensorIs4d(inputDesc)) { + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else if (tensorIs3d(inputDesc)) { + CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); + CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh)); + iw = ow = 1; + bw = 1; + } else { + return NOT_SUPPORTED; + } + + int cx = 1; + if (idf == DF_NCHWC8) { + cx = 8; + } + if (idf == DF_NCHWC16) { + cx = 16; + } + U32 icx = ic / cx; + for (U32 n = 0, o_i = 0; n < in; n++) { + for (U32 c1 = 0; c1 < icx; c1++) { + for (int c2 = 0; c2 < cx; c2++) { + for (int i = 0; i < bh; i++) { + for (int j = 0; j < bw; j++) { + for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w++, o_i++) { + int i_i = + (((n * icx + c1) * ih + h * bh + i) * iw + w * bw + j) * cx + c2; + output[o_i] = input[i_i]; + } + } + } + } + } + } + } + return SUCCESS; +} + +EE space2depth_cpu( + TensorDesc inputDesc, void *input, Space2DepthParamSpec p, TensorDesc outputDesc, void *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: + ret = space2depth_kernel(inputDesc, (F32 *)input, p, outputDesc, (F32 *)output); + break; +#endif +#ifdef _USE_FP16 + case DT_F16: + ret = space2depth_kernel(inputDesc, (F16 *)input, p, outputDesc, (F16 *)output); + break; +#endif + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/split.cpp b/compute/tensor/src/cpu/split.cpp index 38d25cb5..10895d11 100644 --- a/compute/tensor/src/cpu/split.cpp +++ b/compute/tensor/src/cpu/split.cpp @@ -12,7 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
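The new space2depth.cpp above rearranges each block_size x block_size spatial block into the channel dimension, with extra inner loops for the C8/C16 tiled formats. Stripped of the channel tiling, the index mapping reduces to the following reference loop (a sketch with illustrative names, assuming plain NCHW data):

// Space-to-depth with block size b: input  N x C x (H*b) x (W*b)
//                                   output N x (C*b*b) x H x W
template <typename T>
void space2depth_nchw(const T *in, T *out, int N, int C, int OH, int OW, int b)
{
    const int IH = OH * b, IW = OW * b;
    int o = 0;  // output is written contiguously in (n, c, i, j, h, w) order
    for (int n = 0; n < N; ++n)
        for (int c = 0; c < C; ++c)
            for (int i = 0; i < b; ++i)        // row offset inside a block
                for (int j = 0; j < b; ++j)    // column offset inside a block
                    for (int h = 0; h < OH; ++h)
                        for (int w = 0; w < OW; ++w, ++o)
                            out[o] = in[((n * C + c) * IH + h * b + i) * IW + w * b + j];
}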
#include -#include #include "cpu/tensor_computing_cpu.h" @@ -33,7 +32,7 @@ EE split_cpu(TensorDesc inputDesc, if (nullptr == (*output)[i]) { CHECK_STATUS(NULL_POINTER); } - memcpy((*output)[i], input, tensorNumBytes(outputDesc[i])); + UNI_MEMCPY((*output)[i], input, tensorNumBytes(outputDesc[i])); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/tensor_computing_cpu.h b/compute/tensor/src/cpu/tensor_computing_cpu.h index a504f7f4..513f327d 100644 --- a/compute/tensor/src/cpu/tensor_computing_cpu.h +++ b/compute/tensor/src/cpu/tensor_computing_cpu.h @@ -25,6 +25,7 @@ EE rnn_transform_filter_cpu(const TensorDesc *filterDescs, RNNParamSpec rnnParamSpec, TensorDesc *ftmDesc, void **ftmArray, + float *scale, Arch arch); EE rnn_transform_filter_bytes_cpu( @@ -50,6 +51,7 @@ EE rnncell_cpu(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, RNNParamSpec rnnParamSpec, U32 batchStrideX, @@ -66,6 +68,7 @@ EE rnn_cpu(TensorDesc inputDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, RNNParamSpec rnnParamSpec, U32 tmpBytes, void *tmp, @@ -140,7 +143,8 @@ EE non_max_suppression_cpu(std::vector inputDesc, std::vector input, NonMaxSuppressionParamSpec nonMaxSuppressionParamSpec, TensorDesc outputDesc, - void *output); + void *output, + U32 *length); EE concat_cpu(std::vector inputDesc, std::vector input, @@ -164,8 +168,8 @@ EE power_cpu(TensorDesc inputDesc, EE slice_cpu(TensorDesc inputDesc, void *input, SliceParamSpec p, - std::vector outputDesc, - std::vector *output); + std::vector& outputDesc, + std::vector& output); EE priorbox_cpu(std::vector inputDesc, PriorBoxParamSpec priorBoxParamSpec, @@ -350,4 +354,28 @@ EE gat_cpu(TensorDesc node_feature_desc, TensorDesc outputDesc, void *output, Arch arch); + +EE onehot_cpu( + TensorDesc inputDesc, void *input, OneHotParamSpec p, TensorDesc outputDesc, void *output); + +EE non_zero_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output, U32 *length); + +EE check_cpu(TensorDesc inputADesc, + void *inputA, + TensorDesc inputBDesc, + void *inputB, + CheckParamSpec p, + TensorDesc outputDesc, + void *output); + +EE cast_cpu(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *output); + +EE space2depth_cpu( + TensorDesc inputDesc, void *input, Space2DepthParamSpec p, TensorDesc outputDesc, void *output); + +EE depth2space_cpu( + TensorDesc inputDesc, void *input, Depth2SpaceParamSpec p, TensorDesc outputDesc, void *output); + +EE scale_cpu( + TensorDesc inputDesc, void *input, void *alpha, void *beta, ScaleParamSpec p, TensorDesc outputDesc, void *output, Arch arch); #endif diff --git a/compute/tensor/src/cpu/tfslice.cpp b/compute/tensor/src/cpu/tfslice.cpp index 0394f2ae..74136744 100644 --- a/compute/tensor/src/cpu/tfslice.cpp +++ b/compute/tensor/src/cpu/tfslice.cpp @@ -37,12 +37,15 @@ EE tfslice_infer_output_size_cpu(TensorDesc inputDesc, TfSliceParamSpec p, Tenso } if (axisEnd < 0) { axisEnd = inputDesc.dims[axis] + axisEnd; + axisEnd = UNI_MAX(axisEnd, -1); } else if (axisEnd > (int)(inputDesc.dims[axis])) { axisEnd = inputDesc.dims[axis]; } - CHECK_REQUIREMENT(axisBegin >= 0 && axisEnd >= 0); - int num = (axisEnd - axisBegin) / strides[i]; - outputDesc->dims[axis] = num; + if (strides[i] > 0) { + outputDesc->dims[axis] = (axisEnd - axisBegin + strides[i] - 1) / strides[i]; + } else { + outputDesc->dims[axis] = (axisEnd - axisBegin + strides[i] + 1) / strides[i]; + } begin[i] = axisBegin; end[i] = axisEnd; } @@ -73,20 
+76,31 @@ inline static void recursive_tfslice(U8 *src, U32 tileSize) { if (i == bound) { - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); return; } U32 newSrcNum = srcNum / srcDims[dimNum - 1 - i]; U32 newDstNum = dstNum / dstDims[dimNum - 1 - i]; - if (i + 1 == bound && strides[i] == 1) { - memcpy(dst, src + begin[i] * newSrcNum, tileSize * (end[i] - begin[i])); - return; + if (i + 1 == bound) { + if (strides[i] == 1) { + UNI_MEMCPY(dst, src + begin[i] * newSrcNum, tileSize * (end[i] - begin[i])); + return; + } } - for (int j = begin[i]; j < end[i]; j += strides[i]) { - U8 *newSrc = src + j * newSrcNum; - recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, strides, - i + 1, bound, dimNum, tileSize); - dst += newDstNum; + if (strides[i] > 0) { + for (int j = begin[i]; j < end[i]; j += strides[i]) { + U8 *newSrc = src + j * newSrcNum; + recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, + strides, i + 1, bound, dimNum, tileSize); + dst += newDstNum; + } + } else { + for (int j = begin[i]; j > end[i]; j += strides[i]) { + U8 *newSrc = src + j * newSrcNum; + recursive_tfslice(newSrc, srcDims, newSrcNum, dst, dstDims, newDstNum, begin, end, + strides, i + 1, bound, dimNum, tileSize); + dst += newDstNum; + } } } #endif @@ -110,10 +124,10 @@ EE tfslice_cpu( } if (axisEnd < 0) { axisEnd = inputDesc.dims[axis] + axisEnd; + axisEnd = UNI_MAX(axisEnd, -1); } else if (axisEnd > (int)(inputDesc.dims[axis])) { axisEnd = inputDesc.dims[axis]; } - CHECK_REQUIREMENT(axisBegin >= 0 && axisEnd >= 0); begin[i] = axisBegin; end[i] = axisEnd; } @@ -124,8 +138,8 @@ EE tfslice_cpu( int channelAxis = inputDesc.nDims - 2; if (inputDesc.df == outputDesc.df) { std::vector tmpInputDims(inputDesc.nDims), tmpOutputDims(outputDesc.nDims); - memcpy(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32)); - memcpy(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(tmpInputDims.data(), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(tmpOutputDims.data(), outputDesc.dims, outputDesc.nDims * sizeof(U32)); int startAxis = 0; int elementNum = 1; if (inputDesc.df == DF_NCHWC8) { @@ -167,7 +181,7 @@ EE tfslice_cpu( U32 srcIndex = calculateGlobalIndex(localIndex.data(), tmpInputDims.data(), tmpInputDims.size()); U8 *src = (U8 *)input + srcIndex * elementSize; - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); } #endif if (inputDesc.df == DF_NCHWC8) { @@ -179,7 +193,7 @@ EE tfslice_cpu( U32 tmpNDims = inputDesc.nDims + 1; std::vector tmpDims(tmpNDims); tmpDims[0] = 8; - memcpy(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32)); + UNI_MEMCPY(&(tmpDims[1]), inputDesc.dims, inputDesc.nDims * sizeof(U32)); for (U32 i = 0; i < num; i++, dst += elementSize) { std::vector localIndex = calculateLocalIndex(i, outputDesc.dims, outputDesc.nDims); for (U32 j = 0; j < dimSize; j++) { @@ -191,7 +205,7 @@ EE tfslice_cpu( localIndex.insert(localIndex.begin(), c8); U32 index = calculateGlobalIndex(localIndex.data(), tmpDims.data(), tmpNDims); U8 *src = (U8 *)input + index * elementSize; - memcpy(dst, src, elementSize); + UNI_MEMCPY(dst, src, elementSize); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/topk.cpp b/compute/tensor/src/cpu/topk.cpp index b5727e0d..9bca596f 100644 --- a/compute/tensor/src/cpu/topk.cpp +++ b/compute/tensor/src/cpu/topk.cpp @@ -14,6 +14,46 @@ #include "cpu/tensor_computing_cpu.h" #include +template +inline static bool cmp(T *data, const int &a, const int 
&b) +{ + if (increase) { + return (data[a] < data[b]) || (data[a] == data[b] && a < b); + } else { + return (data[a] > data[b]) || (data[a] == data[b] && a < b); + } +} + +template +static void heap(int *buffer, int i, int k, T *data) +{ + while (true) { + int left = 2 * i + 1; + int right = left + 1; + if (right < k) { + bool replace = cmp(data, buffer[i], buffer[left]); + if (replace && cmp(data, buffer[right], buffer[left])) { + auto tmp = buffer[i]; + buffer[i] = buffer[left]; + buffer[left] = tmp; + i = left; + } else if (replace || cmp(data, buffer[i], buffer[right])) { + auto tmp = buffer[i]; + buffer[i] = buffer[right]; + buffer[right] = tmp; + i = right; + } else + break; + } else if ((left < k) && cmp(data, buffer[i], buffer[left])) { + auto tmp = buffer[i]; + buffer[i] = buffer[left]; + buffer[left] = tmp; + i = left; + } else + break; + } +} + template inline static void topk_kernel( const TensorDesc &inputDesc, T *input, const TopKParamSpec &p, int *tmp, T *output, int *index) @@ -26,31 +66,67 @@ inline static void topk_kernel( for (U32 i = axis + 1; i < inputDesc.nDims; i++) { loopOuter *= inputDesc.dims[i]; } - int num = UNI_MIN(loops, p.topk); + int num = loops; + if (p.k > 0 && p.k < num) { + num = p.k; + } int *tmpEnd = tmp + loops; for (int i = 0; i < loopOuter; i++) { int offset = i * loops * loopInner; for (int j = 0; j < loopInner; j++, offset++) { +#if 0 for (int k = 0; k < loops; k++) { - tmp[k] = k; + tmp[k] = offset + k * loopInner; } if (increase) { - std::sort(tmp, tmpEnd, [&input, &offset, &loopInner](int i1, int i2) { - return input[offset + i1 * loopInner] < input[offset + i2 * loopInner]; - }); + std::stable_sort( + tmp, tmpEnd, [&input](int i1, int i2) { return input[i1] < input[i2]; }); } else { - std::sort(tmp, tmpEnd, [&input, &offset, &loopInner](int i1, int i2) { - return input[offset + i1 * loopInner] > input[offset + i2 * loopInner]; - }); + std::stable_sort( + tmp, tmpEnd, [&input](int i1, int i2) { return input[i1] > input[i2]; }); } if (!order) { std::sort(tmp, tmp + num); } for (int k = 0; k < num; k++) { - int id = (i * p.topk + k) * loopInner + j; - index[id] = tmp[k]; - output[id] = input[offset + tmp[k] * loopInner]; + int id = (i * num + k) * loopInner + j; + index[id] = (tmp[k] - offset) / loopInner; + output[id] = input[tmp[k]]; + } +#else + int l = 0; + int cur_idx = offset; + for (; l < num; ++l) { + tmp[num - l - 1] = cur_idx; + heap(tmp, num - l - 1, num, input); + cur_idx += loopInner; + } + + auto top = tmp[0]; + for (; l < loops; ++l) { + if (cmp(input, cur_idx, top)) { + tmp[0] = cur_idx; + heap(tmp, 0, num, input); + top = tmp[0]; + } + cur_idx += loopInner; + } + if (order) { + for (l = 0; l < num; ++l) { + int id = (i * num + (num - l - 1)) * loopInner + j; + index[id] = (tmp[0] - offset) / loopInner; + output[id] = input[tmp[0]]; + tmp[0] = tmp[num - l - 1]; + heap(tmp, 0, num - l - 1, input); + } + } else { + for (l = 0; l < num; ++l) { + int id = (i * num + l) * loopInner + j; + index[id] = (tmp[l] - offset) / loopInner; + output[id] = input[tmp[l]]; + } } +#endif } } } @@ -88,16 +164,16 @@ EE topk_cpu(TensorDesc inputDesc, if (nullptr == input || nullptr == output || nullptr == index) { CHECK_STATUS(NULL_POINTER); } - EE ret; + EE ret = SUCCESS; switch (inputDesc.dt) { +#ifdef _USE_FP32 case DT_F32: topk_wrapper1(inputDesc, (F32 *)input, p, (I32 *)tmp, (F32 *)output, (I32 *)index); - ret = SUCCESS; break; +#endif #ifdef _USE_FP16 case DT_F16: topk_wrapper1(inputDesc, (F16 *)input, p, (I32 *)tmp, (F16 *)output, (I32 *)index); 
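The topk.cpp rework above replaces the full sort of every slice with a size-k heap: the first k candidates seed the heap, each remaining element displaces the root only when it beats it, and the result is emitted by repeatedly swapping the root out. The same idea in a compact standalone form, using the standard library and omitting the index bookkeeping and tie-breaking that the real cmp()/heap() helpers also handle:

#include <algorithm>
#include <functional>
#include <queue>
#include <vector>

// Top-k largest values with a size-k min-heap: O(n log k) instead of O(n log n).
std::vector<float> topk_largest(const std::vector<float> &data, size_t k)
{
    std::priority_queue<float, std::vector<float>, std::greater<float>> heap;  // min-heap
    for (float v : data) {
        if (heap.size() < k) {
            heap.push(v);
        } else if (v > heap.top()) {  // beats the current k-th largest value
            heap.pop();
            heap.push(v);
        }
    }
    std::vector<float> out;
    for (; !heap.empty(); heap.pop()) {  // pops smallest-first
        out.push_back(heap.top());
    }
    std::reverse(out.begin(), out.end());  // return in descending order
    return out;
}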
- ret = SUCCESS; break; #endif default: diff --git a/compute/tensor/src/cpu/transpose.cpp b/compute/tensor/src/cpu/transpose.cpp index b21438b8..c72aa782 100644 --- a/compute/tensor/src/cpu/transpose.cpp +++ b/compute/tensor/src/cpu/transpose.cpp @@ -20,9 +20,8 @@ EE transpose_cpu( if (nullptr == input && tensorNumElements(inputDesc) == 0) { return SUCCESS; } - if (nullptr == input || nullptr == output || nullptr == dim) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } array_transpose(bytesOf(inputDesc.dt), inputDesc.dims, input, outputDesc.dims, output, dim, inputDesc.nDims, outputDesc.nDims); diff --git a/compute/tensor/src/cpu/x86/check.cpp b/compute/tensor/src/cpu/x86/check.cpp deleted file mode 100644 index 9a28324a..00000000 --- a/compute/tensor/src/cpu/x86/check.cpp +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- -#include "cpu/x86/tensor_computing_x86.h" -#include "x86_avx2_expand.h" -#ifdef _USE_FP32 -#include "cpu/x86/fp32/tensor_computing_fp32.h" -#endif - -template -EE check_u32(TensorDesc inputDescA, - const T *inputA, - TensorDesc inputDescB, - const T *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - for (U32 j = 0; j < loopOuter; j++) { - const T *arrayA = inputA + j * length; - const T *arrayB = inputB + j * length; - __m256i count_v = _mm256_set1_epi32(0); - __m256i one_v = _mm256_set1_epi32(1); - switch (checkMode) { - case CHECK_GREAT: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - count_v = _mm256_add_epi32( - count_v, _mm256_and_si256(one_v, _mm256_cmpgt_epi32(a, b))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - __m256i cmp = - _mm256_or_si256(_mm256_cmpeq_epi32(a, b), _mm256_cmpgt_epi32(a, b)); - count_v = _mm256_add_epi32(count_v, _mm256_and_si256(one_v, cmp)); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256i a = _mm256_loadu_si256((__m256i *)(arrayA + i)); - __m256i b = _mm256_loadu_si256((__m256i *)(arrayB + i)); - count_v = _mm256_add_epi32( - count_v, _mm256_and_si256(one_v, _mm256_cmpeq_epi32(a, b))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - return NOT_SUPPORTED; - break; - } - } - return SUCCESS; -} - -template -EE check_kernel(TensorDesc inputDescA, - const TA *inputA, - TensorDesc inputDescB, - const TB *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - I32 length = size / loopOuter; - - for (U32 j = 0; j < loopOuter; j++) { - const TA *arrayA = inputA + j * length; - const TB *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] <= (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } - case CHECK_GREATEQUAL: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] < (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } 
- case CHECK_EQUAL: { - output[j] = 1; - for (I32 i = 0; i < length; i++) { - if (arrayA[i] != (TA)arrayB[i]) { - output[j] = 0; - break; - } - } - break; - } - default: - return NOT_SUPPORTED; - break; - } - } - return SUCCESS; -} - -template -EE check_wrapper(TensorDesc inputDescA, - const TA *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - EE ret = SUCCESS; - switch (inputDescB.dt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const F32 *)inputB, checkMode, outputDesc, output); - break; - } -#endif - case DT_U32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const U32 *)inputB, checkMode, outputDesc, output); - break; - } - case DT_I32: { - ret = check_kernel( - inputDescA, inputA, inputDescB, (const I32 *)inputB, checkMode, outputDesc, output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; -} - -EE check_x86(TensorDesc inputDescA, - const void *inputA, - TensorDesc inputDescB, - const void *inputB, - CheckParamSpec p, - TensorDesc outputDesc, - void *output) -{ - DataType idt = inputDescA.dt; - EE ret = SUCCESS; - - if (idt != inputDescB.dt) { - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_wrapper(inputDescA, (const F32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_wrapper(inputDescA, (const U32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_wrapper(inputDescA, (const I32 *)inputA, inputDescB, inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - return ret; - } - - switch (idt) { -#ifdef _USE_FP32 - case DT_F32: { - ret = check_fp32(inputDescA, (const F32 *)inputA, inputDescB, (const F32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } -#endif - case DT_U32: { - ret = check_u32(inputDescA, (const U32 *)inputA, inputDescB, (const U32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - case DT_I32: { - ret = check_u32(inputDescA, (const I32 *)inputA, inputDescB, (const I32 *)inputB, - p.check_mode, outputDesc, (I32 *)output); - break; - } - default: - ret = NOT_SUPPORTED; - break; - } - - return ret; -} diff --git a/compute/tensor/src/cpu/x86/convolution.cpp b/compute/tensor/src/cpu/x86/convolution.cpp index 2b4370de..16c85e85 100644 --- a/compute/tensor/src/cpu/x86/convolution.cpp +++ b/compute/tensor/src/cpu/x86/convolution.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
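The x86 check kernels deleted above implemented a per-row reduction: for each outer row, the output is 1 only when every element pair satisfies the requested relation. That responsibility now sits with the arch-neutral check_cpu declared in tensor_computing_cpu.h; its contract, written as a plain scalar loop with illustrative names (the enum below is a stand-in, not the library's), looks roughly like this:

// Per-row "all elements satisfy the relation" semantics of the check operator.
enum class CheckOp { Great, GreatEqual, Equal };

template <typename T>
void check_rows(const T *a, const T *b, int rows, int cols, CheckOp op, int *out)
{
    for (int j = 0; j < rows; ++j) {
        out[j] = 1;
        for (int i = 0; i < cols; ++i) {
            T x = a[j * cols + i], y = b[j * cols + i];
            bool ok = (op == CheckOp::Great)      ? (x > y)
                    : (op == CheckOp::GreatEqual) ? (x >= y)
                                                  : (x == y);
            if (!ok) {
                out[j] = 0;  // one failing pair invalidates the whole row
                break;
            }
        }
    }
}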
-#include #include "cpu/x86/tensor_computing_x86.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" @@ -52,10 +51,12 @@ EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, U32 group = convParamSpec.group; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; + U32 dilateH = convParamSpec.dilatedRate_h; + U32 dilateW = convParamSpec.dilatedRate_w; if ((targetDataType != DT_I8) && (targetDataType != DT_U8_Q) && ((idf != DF_NCHWC8) || (ic / group % 8 != 0))) { @@ -63,6 +64,13 @@ EE convolution_infer_forward_algorithm_x86(TensorDesc inputDesc, return SUCCESS; } + if ((targetDataType == DT_F32) && (idf == DF_NCHWC8) && (group == 1) && (fh == 3) && + (fw == 3) && (dilateH == 1) && (dilateW == 1) && (oh > 8) && (ow > 8) && (strideH == 1) && + (strideW == 1)) { + *algorithm = CONVOLUTION_ALGORITHM_WINOGRAD; + return SUCCESS; + } + if ((fh == 1) && (fw == 1)) { *algorithm = CONVOLUTION_ALGORITHM_POINTWISE; return SUCCESS; @@ -101,6 +109,10 @@ EE convolution_transform_filter_bytes_x86(TensorDesc filterDesc, case CONVOLUTION_ALGORITHM_POINTWISE: *bytes = fnPadding * fcPadding; break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + *bytes = + fnPadding * fcPadding * 36 + 16 * 32 * 18; // bolckIc:16, blockOc:32, weight:3*6=18 + break; default: return NOT_SUPPORTED; } @@ -205,8 +217,7 @@ EE convolution_x86(TensorDesc inputDesc, U32 icGroupSize = inputDesc.dims[dataChannelAxis] / group; void *inputTransform; - if ((inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0) || - (inputDesc.df == DF_NCHWC16 && icGroupSize % 16 != 0)) { + if ((inputDesc.df == DF_NCHWC8 && icGroupSize % 8 != 0)) { TensorDesc tmpInputDesc = inputDesc; tmpInputDesc.df = DF_NCHW; transformToNCHW(inputDesc, input, tmpInputDesc, tmp); @@ -248,9 +259,10 @@ EE convolution_x86(TensorDesc inputDesc, #endif #ifdef _USE_INT8 case DT_I8: { - ret = convolution_int8(tmpInputDesc, (UINT8 *)tmpInput, tmpFilterDesc, - (INT8 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, (I32 *)tmpBias, - tmpBytes, tmp, tmpOutputDesc, tmpOutput, (F32 *)scale, activationDesc, arch); + ret = convolution_int8(tmpInputDesc, (UINT8 *)tmpInput, (F32 *)eltwiseInput, + tmpFilterDesc, (INT8 *)tmpFilter, convParamSpec, algorithm, tmpBiasDesc, + (F32 *)tmpBias, tmpBytes, tmp, tmpOutputDesc, tmpOutput, (F32 *)scale, + activationDesc, arch); break; } #endif diff --git a/compute/tensor/src/cpu/x86/deconvolution.cpp b/compute/tensor/src/cpu/x86/deconvolution.cpp index fc0a6394..7dde4d48 100644 --- a/compute/tensor/src/cpu/x86/deconvolution.cpp +++ b/compute/tensor/src/cpu/x86/deconvolution.cpp @@ -128,15 +128,15 @@ EE deconvolution_pointwise_x86(TensorDesc inputDesc, CHECK_REQUIREMENT(idf == DF_NCHWC8); ConvolutionParamSpec p = createConvolutionParamSpec( - 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, oc, Convolution_Pointwise); + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, oc, CONVOLUTION_POINTWISE); TensorDesc nullDesc; U8 *convBias = (U8 *)tmp; if (fh == convParamSpec.stride_h && fw == convParamSpec.stride_w) { for (U32 ii = 0; ii < fh * fw; ++ii) { - memcpy(convBias + ii * oc * bytesOf(odt), bias, oc * bytesOf(odt)); + UNI_MEMCPY(convBias + ii * oc * bytesOf(odt), bias, oc * 
bytesOf(odt)); } } else { - memset(convBias, 0, oc * fh * fw * bytesOf(odt)); + UNI_MEMSET(convBias, 0, oc * fh * fw * bytesOf(odt)); } TensorDesc convOutDesc = tensor4df(odt, DF_NCHWC8, in, oc * fh * fw, ih, iw); U8 *convOut = (U8 *)tmp + oc * fh * fw * bytesOf(odt); @@ -153,11 +153,13 @@ EE deconvolution_pointwise_x86(TensorDesc inputDesc, } else { U8 *tmpOutputPtr = (U8 *)output; U32 biasTileSize = bytesOf(biasDesc.dt) * 8; - U8 *biasPtr = (U8 *)bias; - for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { - for (U32 n = 0; n < oh * ow; n++) { - memcpy(tmpOutputPtr, biasPtr, biasTileSize); - tmpOutputPtr += biasTileSize; + for (U32 n = 0; n < on; ++n) { + U8 *biasPtr = (U8 *)bias; + for (U32 c = 0; c < oc / 8; c++, biasPtr += biasTileSize) { + for (U32 hw = 0; hw < oh * ow; hw++) { + UNI_MEMCPY(tmpOutputPtr, biasPtr, biasTileSize); + tmpOutputPtr += biasTileSize; + } } } deconvolution_overlap_crop_c8_x86(convOut, output, inputDesc, outputDesc, convParamSpec); diff --git a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp index 3a21f766..5c6d5e1f 100644 --- a/compute/tensor/src/cpu/x86/depthwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/depthwise_convolution.cpp @@ -39,6 +39,7 @@ EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, } EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc dwFilterDesc, TensorDesc outputDesc, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, @@ -47,16 +48,18 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, if (nullptr == bytes) { CHECK_STATUS(NULL_POINTER); } - DataType idt, odt; - DataFormat idf, odf; + DataType idt, odt, fdt; + DataFormat idf, odf, fdf; U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; U32 on, oc, oh, ow; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + CHECK_STATUS(tensor4dGet(outputDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -66,7 +69,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, *bytes = ic * ih_pad * iw_pad; break; case DEPTHWISE_POINTWISE_CONVOLUTION_ALGORITHM_DIRECT: - *bytes = ic * ih_pad * iw_pad + ic * oh * ow; + *bytes = ic * ih_pad * (iw_pad + 4) + ic * oh * ow + ic * 4; break; default: { ret = NOT_MATCH; @@ -74,6 +77,9 @@ EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, break; } } + if (idt == DT_I8 || idt == DT_U8_Q) { + *bytes += fh * fw * 16 * 16; + } *bytes *= bytesOf(idt); if (idf != DF_NCHWC8) { *bytes += tensorNumBytes(inputDesc); @@ -88,6 +94,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, const void *filter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc biasDesc, const void *bias, U32 tmpBytes, @@ -100,7 +107,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, TensorDesc blankTensorDesc; ActivationParamSpec blankActivationParamSpec; return depthwise_pointwise_convolution_x86(inputDesc, input, nullptr, 
filterDesc, filter, - blankTensorDesc, nullptr, convParamSpec, algorithm, blankTensorDesc, bias, biasDesc, + blankTensorDesc, nullptr, convParamSpec, algorithm, nullptr, blankTensorDesc, bias, biasDesc, nullptr, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, blankActivationParamSpec, arch); } diff --git a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp index f65e1606..60b332b3 100644 --- a/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/depthwise_pointwise_convolution.cpp @@ -15,6 +15,9 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif #include "tensor_transpose.h" EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, @@ -36,6 +39,14 @@ EE depthwise_pointwise_convolution_transform_filter_x86(TensorDesc dwFilterDesc, (F32 *)dwFilterTransformed, pwFtmDesc, (F32 *)pwFilterTransformed); break; } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_transform_filter_int8(dwFilterDesc, + (INT8 *)dwFilter, pwFilterDesc, (INT8 *)pwFilter, algorithm, dwFtmDesc, + (INT8 *)dwFilterTransformed, pwFtmDesc, (INT8 *)pwFilterTransformed); + break; + } #endif default: ret = NOT_SUPPORTED; @@ -53,6 +64,7 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, const void *pwFilter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc dwBiasDesc, const void *dwBias, TensorDesc pwBiasDesc, @@ -67,22 +79,38 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, { TensorDesc newInputDesc = inputDesc; void *newInput = input; - if (inputDesc.df != DF_NCHWC8) { - newInputDesc.df = DF_NCHWC8; + DataFormat dstF = inputDesc.df; + if (inputDesc.dt == DT_U8_Q || inputDesc.df == DF_NCHWC16) { + dstF = DF_NCHWC16; + } else { + dstF = DF_NCHWC8; + } + if (inputDesc.df != dstF) { + newInputDesc.df = dstF; newInput = tmp; tmp = (U8 *)tmp + tensorNumBytes(inputDesc); tmpBytes -= tensorNumBytes(inputDesc); - transformNCHWToNCHWC8(inputDesc, input, newInputDesc, newInput); + transformFormat(inputDesc, input, newInputDesc, newInput); } EE ret = SUCCESS; switch (dwFilterDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = depthwise_pointwise_convolution_fp32(newInputDesc, (F32 *)newInput, - (F32 *)eltwiseInput, dwFilterDesc, (const F32 *)dwFilter, pwFilterDesc, - (const F32 *)pwFilter, convParamSpec, algorithm, dwBiasDesc, (const F32 *)dwBias, - pwBiasDesc, (const F32 *)pwBias, tmpBytes, tmp, outputDesc, (F32 *)output, - depthwiseActivationParamSpec, pointwiseActivationParamSpec, arch); + ret = depthwise_pointwise_convolution_fp32(newInputDesc, (F32 *)newInput, (F32 *)eltwiseInput, dwFilterDesc, + (const F32 *)dwFilter, pwFilterDesc, (const F32 *)pwFilter, convParamSpec, + algorithm, dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (F32 *)output, depthwiseActivationParamSpec, + pointwiseActivationParamSpec, arch); + break; + } +#endif +#ifdef _USE_INT8 + case DT_I8: { + ret = depthwise_pointwise_convolution_int8(newInputDesc, (UINT8 *)newInput, (F32 *)eltwiseInput, dwFilterDesc, + (const INT8 *)dwFilter, pwFilterDesc, (const INT8 *)pwFilter, convParamSpec, + dwBiasDesc, (const F32 *)dwBias, pwBiasDesc, (const F32 *)pwBias, + tmpBytes, tmp, outputDesc, (void *)output, (F32 *)scale, depthwiseActivationParamSpec, + 
pointwiseActivationParamSpec); break; } #endif diff --git a/compute/tensor/src/cpu/x86/eltwise.cpp b/compute/tensor/src/cpu/x86/eltwise.cpp index f11fbd87..2e18d9dd 100644 --- a/compute/tensor/src/cpu/x86/eltwise.cpp +++ b/compute/tensor/src/cpu/x86/eltwise.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/tensor_computing_x86.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" @@ -25,7 +24,7 @@ EE eltwise_x86(DataType dataType, void *output, EltwiseMode eltwiseMode) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (dataType) { #ifdef _USE_FP32 case DT_F32: { @@ -33,6 +32,7 @@ EE eltwise_x86(DataType dataType, break; } #endif + case DT_U32: case DT_I32: { ret = eltwise_i32(input, inputSize, num, len, output, eltwiseMode); break; @@ -42,7 +42,6 @@ EE eltwise_x86(DataType dataType, break; } default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/x86/fp32/attention.cpp b/compute/tensor/src/cpu/x86/fp32/attention.cpp index dab532ec..ddb60c0a 100644 --- a/compute/tensor/src/cpu/x86/fp32/attention.cpp +++ b/compute/tensor/src/cpu/x86/fp32/attention.cpp @@ -25,14 +25,14 @@ EE attention_fp32(U32 batch, } F32 mask_s = -10000.0; - I32 count = array_sum_f32(input, toSequenceLength); - I32 valid = UNI_MIN(count, fromSequenceLength); __m256 mask_v = _mm256_set1_ps(mask_s); __m256 one_v = _mm256_set1_ps(1.0); for (U32 n = 0; n < batch; n++) { + U32 count = array_sum_f32(input, toSequenceLength); + U32 valid = UNI_MIN(count, (U32)fromSequenceLength); for (U32 i = 0; i < numHeads; i++) { if (i == 0) { - for (I32 j = 0; j < valid; j++) { + for (U32 j = 0; j < valid; j++) { if (j == 0) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -46,12 +46,12 @@ EE attention_fp32(U32 batch, output[k] = value; } } else { - memcpy( + UNI_MEMCPY( output + j * toSequenceLength, output, toSequenceLength * sizeof(F32)); } } - for (I32 j = valid; j < fromSequenceLength; j++) { + for (U32 j = valid; j < (U32)fromSequenceLength; j++) { if (j == valid) { I32 k = 0; for (; k < toSequenceLength - 7; k += 8) { @@ -61,12 +61,12 @@ EE attention_fp32(U32 batch, output[j * toSequenceLength + k] = mask_s; } } else { - memcpy(output + j * toSequenceLength, output + valid * toSequenceLength, + UNI_MEMCPY(output + j * toSequenceLength, output + valid * toSequenceLength, toSequenceLength * sizeof(F32)); } } } else { - memcpy(output + i * fromSequenceLength * toSequenceLength, output, + UNI_MEMCPY(output + i * fromSequenceLength * toSequenceLength, output, fromSequenceLength * toSequenceLength * sizeof(F32)); } } diff --git a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp index 9d683bca..2f4c3bb2 100644 --- a/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp +++ b/compute/tensor/src/cpu/x86/fp32/attention_mask.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
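The attention.cpp change above moves the count/valid computation inside the batch loop, so the mask is rebuilt for each batch item instead of being shared. The mask itself follows the usual additive pattern: given a 0/1 key-validity vector, allowed positions contribute 0 to the attention logits while padded positions contribute a large negative constant, so softmax drives their weights toward zero; query rows beyond the number of valid positions are fully masked. A rough sketch under those assumptions (illustrative function, not the kernel itself):

#include <vector>

// Build an additive attention mask from a 0/1 key-validity vector.
std::vector<float> build_attention_mask(
    const std::vector<float> &keyValid, int fromLen, int toLen, int validFrom)
{
    const float kMask = -10000.0f;
    std::vector<float> mask(fromLen * toLen);
    for (int j = 0; j < fromLen; ++j) {
        for (int k = 0; k < toLen; ++k) {
            float allowed = (j < validFrom) ? keyValid[k] : 0.0f;
            mask[j * toLen + k] = (1.0f - allowed) * kMask;  // 0 if allowed, -10000 if masked
        }
    }
    return mask;  // added to the raw attention scores before softmax
}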
-#include #include "cpu/x86/fp32/tensor_computing_fp32.h" EE attention_mask_fp32(TensorDesc inputDesc, @@ -56,7 +55,7 @@ EE attention_mask_fp32(TensorDesc inputDesc, if (start + loops > klen) { loops = UNI_MAX(klen - start, 0); } - memset(&mask[i * klen + start], 0, sizeof(F32) * loops); + UNI_MEMSET(&mask[i * klen + start], 0, sizeof(F32) * loops); } } I32 loops = tensorNumElements(inputDesc) / length; diff --git a/compute/tensor/src/cpu/x86/fp32/check.cpp b/compute/tensor/src/cpu/x86/fp32/check.cpp deleted file mode 100644 index 9140fe00..00000000 --- a/compute/tensor/src/cpu/x86/fp32/check.cpp +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. - -// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), -// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE -// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR -// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -#include "cpu/x86/fp32/tensor_computing_fp32.h" -#include "x86_avx2_expand.h" - -EE check_fp32(TensorDesc inputDescA, - const F32 *inputA, - TensorDesc inputDescB, - const F32 *inputB, - CheckMode checkMode, - TensorDesc outputDesc, - I32 *output) -{ - if (nullptr == inputA || nullptr == inputB || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - - if (tensorNumElements(inputDescA) != tensorNumElements(inputDescB)) { - CHECK_STATUS(NOT_MATCH); - } - - U32 size = tensorNumElements(inputDescA); - U32 loopOuter = inputDescA.dims[inputDescA.nDims - 1]; - I32 length = size / loopOuter; - if (tensorNumElements(outputDesc) != loopOuter) { - CHECK_STATUS(NOT_MATCH); - } - for (U32 j = 0; j < loopOuter; j++) { - const F32 *arrayA = inputA + j * length; - const F32 *arrayB = inputB + j * length; - switch (checkMode) { - case CHECK_GREAT: { - __m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GT_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] > arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_GREATEQUAL: { - __m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_GE_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] >= arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - case CHECK_EQUAL: { - 
__m256i count_v = _mm256_set1_epi32(0); - I32 i = 0; - for (; i < length - 7; i += 8) { - __m256 a = _mm256_loadu_ps(arrayA + i); - __m256 b = _mm256_loadu_ps(arrayA + i); - count_v = _mm256_add_epi32( - count_v, _mm256_cvtps_epi32(_mm256_cmp_ps(a, b, _CMP_EQ_OS))); - } - I32 count = _mm256_hadd_u32(count_v); - for (; i < length; i++) { - if (arrayA[i] == arrayB[i]) { - count++; - } - } - output[j] = (count == length); - break; - } - default: - CHECK_STATUS(NOT_SUPPORTED); - break; - } - } - return SUCCESS; -} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution.cpp b/compute/tensor/src/cpu/x86/fp32/convolution.cpp index fb4ea453..c6a31782 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution.cpp @@ -12,8 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "sys.h" -#include "error.h" - #include "cpu/x86/fp32/tensor_computing_fp32.h" EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, @@ -34,10 +32,10 @@ EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -64,6 +62,13 @@ EE convolution_infer_forward_tmp_bytes_fp32(TensorDesc inputDesc, case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: *bytes = 0; break; + case CONVOLUTION_ALGORITHM_WINOGRAD: { + U32 wSize = 3; + U32 blockIcDim = 32; + U32 blockOcDim = 32; + *bytes = 36 * blockIcDim * ((ow + 3) / 4 + 1) + (36 * blockOcDim + 36 * 36) * wSize; + break; + } default: ret = NOT_MATCH; break; @@ -134,6 +139,10 @@ EE convolution_fp32(TensorDesc inputDesc, ret = convolution_direct_nchw(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); break; + case CONVOLUTION_ALGORITHM_WINOGRAD: + ret = convolution_winograd(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, + biasDesc, bias, tmpBytes, tmp, outputDesc, output, activationDesc); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp index 265fb722..eb0a3b41 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_1x1_direct.cpp @@ -1752,6 +1752,10 @@ EE convolution_1x1_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + if (idf == DF_NCHWC16 && ih == 1 && iw == 1) { + idf = DF_NCHWC8; + } + if ((fdf != DF_NCHWCxN24 && fdf != DF_NCHWCxN32) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { CHECK_STATUS(NOT_MATCH); } @@ -1765,13 +1769,16 @@ EE convolution_1x1_direct(TensorDesc inputDesc, I32 unrollHwArray[4] = {12, 6, 4, 3}; // get computing params - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = 
convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; + U32 phT = (paddingT + strideH - 1) / strideH; + U32 phB = (paddingB + strideH - 1) / strideH; U32 ohow = oh * ow; + U32 ohowMain = (oh - phT - phB) * ow; U32 ihiw = ih * iw; U32 newIh = (ih + strideH - 1) / strideH; U32 newIw = (iw + strideW - 1) / strideW; @@ -1783,16 +1790,16 @@ EE convolution_1x1_direct(TensorDesc inputDesc, U32 ocBlockNums = InferConvDirectOcBlockNum(oc, ocbArray, unrollOc, unrollOcArray); U32 ocBBlockNums = BLOCK_OC_DIM / unrollOc; U32 alpha = OMP_NUM_THREADS / gcd(ocBlockNums, OMP_NUM_THREADS); - U32 blockHwDim = InferConvBlockHW(ohow, BLOCK_HW_DIM, alpha); + U32 blockHwDim = InferConvBlockHW(ohowMain, BLOCK_HW_DIM, alpha); blockHwDim = (blockHwDim + unrollHwX - 1) / unrollHwX * unrollHwX; - U32 hwBlockNums = CeilDivide(ohow, blockHwDim); - if (paddingT != 0 || paddingB != 0 || paddingL != 0 || paddingR != 0) { + U32 hwBlockNums = CeilDivide(ohowMain, blockHwDim); + if (paddingL != 0 || paddingR != 0) { hwBlockNums = oh; } -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP OpenMPController ompCtr; - ompCtr.checkAndSetOpenMP(ohow, BLOCK_HW_DIM, ocBlockNums); + ompCtr.checkAndSetOpenMP(ohowMain, BLOCK_HW_DIM, ocBlockNums); #endif // infer kernel params @@ -1831,7 +1838,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (ompCtr.useOmp) { #endif F32 *tmpI = inArray; @@ -1844,14 +1851,15 @@ EE convolution_1x1_direct(TensorDesc inputDesc, #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif - for (U32 hc = 0; hc < ih * ic8; hc += strideH) { - U32 c = hc / ih; - U32 h = hc % ih; - for (U32 w = 0; w < iw; w += strideW) { - U32 nh = h / strideH; - U32 nw = w / strideW; - memcpy(tmpI + c * newIw * newIh * SIMDW + (nh * newIw + nw) * SIMDW, - bInArray + c * ihiw * SIMDW + (h * iw + w) * SIMDW, SIMDW * sizeof(F32)); + for (U32 hc = 0; hc < oh * ic8; ++hc) { + U32 c = hc / oh; + U32 h = hc % oh; + for (U32 w = 0; w < ow; ++w) { + U32 nh = h * strideH; + U32 nw = w * strideW; + UNI_MEMCPY(tmpI + c * ohow * SIMDW + (h * ow + w) * SIMDW, + bInArray + c * ihiw * SIMDW + (nh * iw + nw) * SIMDW, + SIMDW * sizeof(F32)); } } paddingT = (paddingT + strideH - 1) / strideH; @@ -1875,14 +1883,26 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } F32 *curI = tmpI + icb * newIw * newIh; - if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + if (phT > 0 || phB > 0) { + U32 minUpper = UNI_MIN((ocbb + ocbSize) * unrollOc, oc); + for (U32 oci = ocbb * unrollOc; oci < minUpper; oci += SIMDW) { + __m256 biasVec = _mm256_load_ps(btmp + oci); + for (U32 hw = 0; hw < phT * ow; ++hw) { + _mm256_storeu_ps(bOutArray + oci * ohow + hw * SIMDW, biasVec); + } + for (U32 hw = (oh - phB) * ow; hw < oh * ow; ++hw) { + _mm256_storeu_ps(bOutArray + oci * ohow + hw * SIMDW, biasVec); + } + } + } + if (paddingL == 0 && paddingR == 0) { #ifdef _USE_OPENMP #pragma omp for schedule(static) #endif for (U32 bIdx = 0; bIdx < hwocBlockNums; ++bIdx) { FTZ; U32 hw = (bIdx / ocbSize) * blockHwDim; - U32 hwSize = UNI_MIN(blockHwDim, ohow - hw); + U32 hwSize = UNI_MIN(blockHwDim, ohowMain - hw); U32 ocBlockIdx = bIdx % ocbSize + ocbb; U32 ocb = 
GetOcIdx(ocBlockIdx, oc, unrollOc, ocbArray); U32 ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -1891,8 +1911,8 @@ EE convolution_1x1_direct(TensorDesc inputDesc, const F32 *curB = biasArray + ocb; const F32 *curW = filterArray + ocb * ic + icb * ocSize; - F32 *curO = bOutArray + ocb * oh * ow; - F32 *curE = eltwiseInput + ocb * oh * ow; + F32 *curO = bOutArray + ocb * oh * ow + phT * ow * SIMDW; + F32 *curE = eltwiseInput + ocb * oh * ow + phT * ow * SIMDW; U32 ihwSize = 0; for (U32 ihw = hw; ihw < hw + hwSize; ihw += ihwSize) { if ((hw + hwSize - ihw) >= unrollHw) { @@ -1913,7 +1933,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, #endif for (U32 bIdx = 0; bIdx < hwocBlockNums; ++bIdx) { FTZ; - U32 h = bIdx / ocbSize; + U32 h = bIdx / ocbSize + phT; U32 ocBlockIdx = bIdx % ocbSize + ocbb; U32 ocb = GetOcIdx(ocBlockIdx, oc, unrollOc, ocbArray); U32 ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -1952,9 +1972,6 @@ EE convolution_1x1_direct(TensorDesc inputDesc, } #ifdef _USE_OPENMP } -#ifdef _WIN32 - ompCtr.resetOpenMP(); -#endif #endif return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp index ce60d335..3cb454d2 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct.cpp @@ -1950,10 +1950,10 @@ EE convolution_direct(TensorDesc inputDesc, // get computing params I32 strideH = convParamSpec.stride_h; I32 strideW = convParamSpec.stride_w; - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 dilateH = convParamSpec.dilatedRate_h; I32 dilateW = convParamSpec.dilatedRate_w; I32 ih_pad = ih + paddingT + paddingB; @@ -1975,7 +1975,7 @@ EE convolution_direct(TensorDesc inputDesc, I32 hwocBlockNums = hwBlockNums * ocBlockNums; I32 blockIcDim = InferConvDirectBolckIcDim(BLOCK_IC_DIM, unrollOc, blockHwDim, fh, fw); -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP OpenMPController ompCtr; ompCtr.checkAndSetOpenMP(ohow, BLOCK_HW_DIM, ocBlockNums); #endif @@ -1992,12 +1992,12 @@ EE convolution_direct(TensorDesc inputDesc, if (idf == DF_NCHWC8 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { tmpInput = bInArray; } else { - // TODO: optimize the memcpy + // TODO: optimize the UNI_MEMCPY PaddingNCHWC8(bInArray, tmpInput, inputDesc, convParamSpec); } #ifdef _USE_OPENMP -#pragma omp parallel num_threads(OMP_NUM_THREADS) +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (ompCtr.useOmp) { #endif I32 flags = 0; @@ -2050,9 +2050,5 @@ EE convolution_direct(TensorDesc inputDesc, #endif } -#if defined(_WIN32) && defined(_USE_OPENMP) - ompCtr.resetOpenMP(); -#endif - return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp index ae20dde8..a4036216 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_direct_nchw.cpp @@ -1671,10 +1671,10 @@ EE convolution_direct_nchw(TensorDesc inputDesc, I32 fhDilated = (fh - 1) * dilateH + 1; I32 fwDilated = (fw - 1) * dilateW + 1; //pad - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 
paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 ohPaddingT = 0; I32 ohPaddingB = 0; if ((paddingL == 0) && (paddingR == 0) && (paddingT != 0 || paddingB != 0)) { diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_functions.h b/compute/tensor/src/cpu/x86/fp32/convolution_functions.h index 71803714..75baf369 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_functions.h +++ b/compute/tensor/src/cpu/x86/fp32/convolution_functions.h @@ -175,19 +175,17 @@ T gcd(T u, T v) return u; } -#if defined(_WIN32) && defined(_USE_OPENMP) +#ifdef _USE_OPENMP struct OpenMPController { - I32 ompThread; + bool useOmp; void checkAndSetOpenMP(I32 ohow, I32 threshold, I32 blockNums) { - ompThread = OMP_NUM_THREADS; +#ifdef _WIN32 if (ohow < threshold && blockNums < OMP_NUM_THREADS) { - OMP_NUM_THREADS = 1; + useOmp = false; } +#endif } - void resetOpenMP() - { - OMP_NUM_THREADS = ompThread; - } + OpenMPController(): useOmp(true) {} }; #endif \ No newline at end of file diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp index f5e0df69..8cd7b7f5 100644 --- a/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/convolution_transform.cpp @@ -16,6 +16,13 @@ #include "cpu/x86/fp32/transform_functions_fp32.h" #include "cpu/x86/fp32/convolution_functions.h" +EE convolution_winograd_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed); + // N is 32/24 template inline EE transformNCHWToNCHWCxNxWrapper( @@ -54,7 +61,7 @@ inline EE convolution_transform_filter_kernel_fp32(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -89,6 +96,11 @@ EE convolution_transform_filter_fp32(TensorDesc filterDesc, TensorDesc *ftmDesc, F32 *filterTransformed) { + if (algorithm == CONVOLUTION_ALGORITHM_WINOGRAD) { + return convolution_winograd_transform_filter_fp32( + filterDesc, filter, convParamSpec, algorithm, ftmDesc, filterTransformed); + } + DataFormat ftmDataFormat; DataType fdt; DataFormat fdf; @@ -133,3 +145,112 @@ EE convolution_transform_filter_fp32(TensorDesc filterDesc, ftmDesc->dims[channelAxis] = filterDesc.dims[channelAxis]; return SUCCESS; } + +void transformWeight4x4_3x3( + const F32 *input, F32 *output, F32 *tmp, U32 blockIc, TensorDesc filterDesc) +{ + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + + __m256 v01666 = _mm256_set1_ps(0.1666666666666667f); + __m256 minusV01666 = _mm256_set1_ps(-0.1666666666666667f); + __m256 v00833 = _mm256_set1_ps(0.0833333333333333f); + __m256 minusV00833 = _mm256_set1_ps(-0.0833333333333333f); + __m256 v004166 = _mm256_set1_ps(0.0416666666666667f); + __m256 v025 = _mm256_set1_ps(0.25f); + + // U32 fn32 = fn / 32; + U32 fnBlocks[3] = {8, 16, 32}; + U32 lstep = fc * fh * fw; + __m256i vindex = _mm256_set_epi32( + lstep * 7, lstep * 6, lstep * 5, lstep * 4, lstep * 3, 
lstep * 2, lstep, 0); + + U32 cx = 0; + for (U32 c = 0; c < fc; c += cx) { + cx = UNI_MIN(blockIc, fc - c); + U32 nSize = 0; + for (U32 n = 0; n < fn; n += nSize) { + nSize = UNI_MIN(32, fn - n); + nSize = fnBlocks[nSize >> 4]; + F32 *curO = output + (c * fn + n * cx) * 36; + for (U32 cb = 0; cb < cx; ++cb) { + for (U32 ni = 0; ni < (nSize / 8); ++ni) { + const F32 *curI = input + (n + ni * 8) * lstep + (c + cb) * fh * fw; + for (U32 i = 0; i < 3; ++i) { + __m256 xi0 = _mm256_i32gather_ps(curI + i, vindex, 4); + __m256 xi1 = _mm256_i32gather_ps(curI + 3 + i, vindex, 4); + __m256 xi2 = _mm256_i32gather_ps(curI + 3 * 2 + i, vindex, 4); + + __m256 t0 = _mm256_mul_ps(v01666, xi2); + __m256 t1 = _mm256_sub_ps(_mm256_mul_ps(minusV01666, xi0), t0); + __m256 t2 = _mm256_fmadd_ps(v004166, xi0, t0); + + __m256 o0 = _mm256_mul_ps(v025, xi0); + __m256 o1 = _mm256_fmadd_ps(xi1, minusV01666, t1); + __m256 o2 = _mm256_fmadd_ps(xi1, v01666, t1); + __m256 o3 = _mm256_fmadd_ps(xi1, v00833, t2); + __m256 o4 = _mm256_fmadd_ps(xi1, minusV00833, t2); + + _mm256_storeu_ps(tmp + (i)*8, o0); + _mm256_storeu_ps(tmp + (3 + i) * 8, o1); + _mm256_storeu_ps(tmp + (3 * 2 + i) * 8, o2); + _mm256_storeu_ps(tmp + (3 * 3 + i) * 8, o3); + _mm256_storeu_ps(tmp + (3 * 4 + i) * 8, o4); + _mm256_storeu_ps(tmp + (3 * 5 + i) * 8, xi2); + } + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(tmp + (3 * i) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (3 * i + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (3 * i + 2) * 8); + + __m256 t0 = _mm256_mul_ps(v01666, xi2); + __m256 t1 = _mm256_sub_ps(_mm256_mul_ps(minusV01666, xi0), t0); + __m256 t2 = _mm256_fmadd_ps(v004166, xi0, t0); + + __m256 o0 = _mm256_mul_ps(v025, xi0); + __m256 o1 = _mm256_fmadd_ps(xi1, minusV01666, t1); + __m256 o2 = _mm256_fmadd_ps(xi1, v01666, t1); + __m256 o3 = _mm256_fmadd_ps(xi1, v00833, t2); + __m256 o4 = _mm256_fmadd_ps(xi1, minusV00833, t2); + + _mm256_storeu_ps(curO + (6 * i) * nSize * cx + cb * nSize + ni * 8, o0); + _mm256_storeu_ps(curO + (6 * i + 1) * nSize * cx + cb * nSize + ni * 8, o1); + _mm256_storeu_ps(curO + (6 * i + 2) * nSize * cx + cb * nSize + ni * 8, o2); + _mm256_storeu_ps(curO + (6 * i + 3) * nSize * cx + cb * nSize + ni * 8, o3); + _mm256_storeu_ps(curO + (6 * i + 4) * nSize * cx + cb * nSize + ni * 8, o4); + _mm256_storeu_ps(curO + (6 * i + 5) * nSize * cx + cb * nSize + ni * 8, xi2); + } + } + } + } + } +} + +EE convolution_winograd_transform_filter_fp32(TensorDesc filterDesc, + const F32 *filter, + ConvolutionParamSpec convParamSpec, + ConvolutionForwardAlgorithm algorithm, + TensorDesc *ftmDesc, + F32 *filterTransformed) +{ + // F(4x4, 3x3) + if (nullptr == filter || nullptr == ftmDesc || nullptr == filterTransformed) { + CHECK_STATUS(NULL_POINTER); + } + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 blockIc = UNI_MIN(32, fc); + F32 *tmp = filterTransformed + fn * fc * 36; + transformWeight4x4_3x3(filter, filterTransformed, tmp, blockIc, filterDesc); + *ftmDesc = tensor4df(fdt, DF_NCHWCxN32, fn, fc, fh, fw); + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp b/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp new file mode 100644 index 00000000..b6062686 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/convolution_winograd.cpp @@ -0,0 +1,1623 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "sys.h" +#include "error.h" + +#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/transform_functions_fp32.h" +#include "cpu/x86/fp32/convolution_functions.h" + +#define BLOCK_IC_DIM 32 + +void transformInput4x4_3x3( + F32 *input, F32 *output, F32 *tmp, U32 iw, U32 ih, U32 ic, U32 wSize, U32 blockIc) +{ + __m256 four = _mm256_set1_ps(4.0f); + __m256 minusFour = _mm256_set1_ps(-4.0f); + __m256 two = _mm256_set1_ps(2.0f); + __m256 minusFive = _mm256_set1_ps(-5.0f); + U32 icb = ic / blockIc; + U32 cb = blockIc / 8; + for (U32 w = 0; w < wSize; ++w) { + for (U32 c = 0; c < icb; ++c) { + for (U32 cc = 0; cc < cb; ++cc) { + F32 *curI = input + (c * blockIc + cc * 8) * ih * iw + w * 4 * 8; + F32 *curO = output + (w * ic + c * blockIc) * 36 + cc * 8; + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(curI + (i)*8); + __m256 xi1 = _mm256_loadu_ps(curI + (iw + i) * 8); + __m256 xi2 = _mm256_loadu_ps(curI + (iw * 2 + i) * 8); + __m256 xi3 = _mm256_loadu_ps(curI + (iw * 3 + i) * 8); + __m256 xi4 = _mm256_loadu_ps(curI + (iw * 4 + i) * 8); + __m256 xi5 = _mm256_loadu_ps(curI + (iw * 5 + i) * 8); + + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(tmp + (i)*8, xi0); + _mm256_storeu_ps(tmp + (6 + i) * 8, xi1); + _mm256_storeu_ps(tmp + (6 * 2 + i) * 8, xi2); + _mm256_storeu_ps(tmp + (6 * 3 + i) * 8, xi3); + _mm256_storeu_ps(tmp + (6 * 4 + i) * 8, xi4); + _mm256_storeu_ps(tmp + (6 * 5 + i) * 8, xi5); + } + + for (U32 i = 0; i < 6; ++i) { + __m256 xi0 = _mm256_loadu_ps(tmp + (i * 6) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (i * 6 + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (i * 6 + 2) * 8); + __m256 xi3 = _mm256_loadu_ps(tmp + (i * 6 + 3) * 8); + __m256 xi4 = _mm256_loadu_ps(tmp + (i * 6 + 4) * 8); + __m256 xi5 = _mm256_loadu_ps(tmp + (i * 6 + 5) * 8); + + if (cc % 2 == 0) { + _mm_prefetch(curO + (6 * i) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 1) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 2) * blockIc, 
_MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 3) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 4) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * i + 5) * blockIc, _MM_HINT_NTA); + } + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(curO + (6 * i) * blockIc, xi0); + _mm256_storeu_ps(curO + (6 * i + 1) * blockIc, xi1); + _mm256_storeu_ps(curO + (6 * i + 2) * blockIc, xi2); + _mm256_storeu_ps(curO + (6 * i + 3) * blockIc, xi3); + _mm256_storeu_ps(curO + (6 * i + 4) * blockIc, xi4); + _mm256_storeu_ps(curO + (6 * i + 5) * blockIc, xi5); + } + } + } + } +} + +void transformInputWithPad4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + U32 iw, + U32 ih, + U32 ic, + U32 wSize, + U32 blockIc, + U32 pl, + U32 pr, + U32 pt, + U32 pb, + U32 h, + U32 w, + U32 oh, + U32 ow) +{ + __m256 four = _mm256_set1_ps(4.0f); + __m256 minusFour = _mm256_set1_ps(-4.0f); + __m256 two = _mm256_set1_ps(2.0f); + __m256 minusFive = _mm256_set1_ps(-5.0f); + U32 icb = ic / blockIc; + U32 cb = blockIc / 8; + + pt = (h > pt) ? 0 : (pt - h); + pl = (w > pl) ? 0 : (pl - w); + for (U32 uw = 0; uw < wSize; ++uw) { + for (U32 c = 0; c < icb; ++c) { + for (U32 cc = 0; cc < cb; ++cc) { + F32 *curI = input + (c * blockIc + cc * 8) * ih * iw; + F32 *curO = output + (uw * ic + c * blockIc) * 36 + cc * 8; + U32 i = 0; + for (; ((i + w) < pl) && (i < 6); ++i) { + UNI_MEMSET(tmp + (i)*8, 0, 32); + UNI_MEMSET(tmp + (6 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 2 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 3 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 4 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 5 + i) * 8, 0, 32); + } + for (; ((i + w + pr) < (ow + 2)) && (i < 6); ++i) { + __m256 xi[6]; + U32 b = 0; + for (; ((b + h) < pt) && (b < 6); ++b) { + xi[b] = _mm256_setzero_ps(); + } + for (; ((b + h + pb) < (oh + 2)) && (b < 6); ++b) { + xi[b] = _mm256_loadu_ps(curI + (iw * (b - pt) + i + uw * 4 - pl) * 8); + } + for (; ((b + h) < (oh + 2)) && (b < 6); ++b) { + xi[b] = _mm256_setzero_ps(); + } + + __m256 t0 = _mm256_fmadd_ps(minusFour, xi[2], xi[4]); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi[1], xi[3]); + __m256 t2 = _mm256_sub_ps(xi[4], xi[2]); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi[3], xi[1])); + __m256 t4 = _mm256_fmadd_ps(four, xi[0], xi[4]); + __m256 t5 = _mm256_fmadd_ps(four, xi[1], xi[5]); + + xi[0] = _mm256_fmadd_ps(minusFive, xi[2], t4); + xi[5] = _mm256_fmadd_ps(minusFive, xi[3], t5); + xi[1] = _mm256_add_ps(t1, t0); + xi[2] = _mm256_sub_ps(t0, t1); + xi[3] = _mm256_add_ps(t3, t2); + xi[4] = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(tmp + (i)*8, xi[0]); + _mm256_storeu_ps(tmp + (6 + i) * 8, xi[1]); + _mm256_storeu_ps(tmp + (6 * 2 + i) * 8, xi[2]); + _mm256_storeu_ps(tmp + (6 * 3 + i) * 8, xi[3]); + _mm256_storeu_ps(tmp + (6 * 4 + i) * 8, xi[4]); + _mm256_storeu_ps(tmp + (6 * 5 + i) * 8, xi[5]); + } + for (; ((i + w) < (ow + 2)) && (i < 6); ++i) { + UNI_MEMSET(tmp + (i)*8, 0, 32); + UNI_MEMSET(tmp + (6 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 2 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 3 + i) * 8, 0, 32); + UNI_MEMSET(tmp + (6 * 4 + i) * 
8, 0, 32); + UNI_MEMSET(tmp + (6 * 5 + i) * 8, 0, 32); + } + + for (U32 j = 0; j < 6; ++j) { + __m256 xi0 = _mm256_loadu_ps(tmp + (j * 6) * 8); + __m256 xi1 = _mm256_loadu_ps(tmp + (j * 6 + 1) * 8); + __m256 xi2 = _mm256_loadu_ps(tmp + (j * 6 + 2) * 8); + __m256 xi3 = _mm256_loadu_ps(tmp + (j * 6 + 3) * 8); + __m256 xi4 = _mm256_loadu_ps(tmp + (j * 6 + 4) * 8); + __m256 xi5 = _mm256_loadu_ps(tmp + (j * 6 + 5) * 8); + + if (cc % 2 == 0) { + _mm_prefetch(curO + (6 * j) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 1) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 2) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 3) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 4) * blockIc, _MM_HINT_NTA); + _mm_prefetch(curO + (6 * j + 5) * blockIc, _MM_HINT_NTA); + } + __m256 t0 = _mm256_fmadd_ps(minusFour, xi2, xi4); + __m256 t1 = _mm256_fmadd_ps(minusFour, xi1, xi3); + __m256 t2 = _mm256_sub_ps(xi4, xi2); + __m256 t3 = _mm256_mul_ps(two, _mm256_sub_ps(xi3, xi1)); + __m256 t4 = _mm256_fmadd_ps(four, xi0, xi4); + __m256 t5 = _mm256_fmadd_ps(four, xi1, xi5); + + xi0 = _mm256_fmadd_ps(minusFive, xi2, t4); + xi5 = _mm256_fmadd_ps(minusFive, xi3, t5); + xi1 = _mm256_add_ps(t1, t0); + xi2 = _mm256_sub_ps(t0, t1); + xi3 = _mm256_add_ps(t3, t2); + xi4 = _mm256_sub_ps(t2, t3); + + _mm256_storeu_ps(curO + (6 * j) * blockIc, xi0); + _mm256_storeu_ps(curO + (6 * j + 1) * blockIc, xi1); + _mm256_storeu_ps(curO + (6 * j + 2) * blockIc, xi2); + _mm256_storeu_ps(curO + (6 * j + 3) * blockIc, xi3); + _mm256_storeu_ps(curO + (6 * j + 4) * blockIc, xi4); + _mm256_storeu_ps(curO + (6 * j + 5) * blockIc, xi5); + } + } + } + w += 4; + } +} + +void transformOutput4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + const F32 *bias, + U32 ow, + U32 oh, + U32 oc, + U32 wSize, + bool addF, + ActivationMode mode) +{ + I64 flag = (I64)addF | (I64(mode) << 1); + __m256 four = _mm256_set1_ps(4.0f); + __m256 eight = _mm256_set1_ps(8.0f); + U32 ocb = oc / 8; + for (U32 c = 0; c < ocb; ++c) { + for (U32 w = 0; w < wSize; ++w) { + F32 *curI = input + w * oc + c * 8; + F32 *curO = output + w * 32 + c * 8 * oh * ow; + I64 stepI = 24 * oc * wSize; + I64 stepT = 192; + for (U32 i = 0; i < 6; ++i) { + F32 *useI0 = curI + i * oc * wSize; + F32 *useI1 = useI0 + 18 * oc * wSize; + F32 *useO0 = tmp + i * 8; + F32 *useO1 = useO0 + 96; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 \n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), [output0] "r"(useO0), + [output1] "r"(useO1), [stepT] "r"(stepT), [four] "x"(four), [eight] "x"(eight) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", 
"%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + + stepT = 32; + stepI = 32; + for (U32 i = 0; i < 4; ++i) { + F32 *useI0 = tmp + 48 * i; + F32 *useI1 = useI0 + 24; + F32 *useO0 = curO + ow * i * 8; + F32 *useO1 = useO0 + 16; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "prefetcht0 (%[output0]) \n\t" + "prefetcht0 (%[output1]) \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 \n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "mov %[flag], %%rax \n\t" + "and $0x1, %%rax \n\t" + "je 0f \n\t" + "vaddps (%[output0]), %%ymm11, %%ymm11 \n\t" + "vaddps (%[output0], %[stepT]), %%ymm12, %%ymm12 \n\t" + "vaddps (%[output1]), %%ymm6, %%ymm6 \n\t" + "vaddps (%[output1], %[stepT]), %%ymm10, %%ymm10 \n\t" + "jmp 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "vmovups (%[bias]), %%ymm0 \n\t" + "vaddps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vaddps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vaddps %%ymm0, %%ymm6, %%ymm6 \n\t" + "vaddps %%ymm0, %%ymm10, %%ymm10 \n\t" + ".align 16 \n\t" + "1: \n\t" + "mov %[flag], %%rax \n\t" + "or $0x1, %%rax \n\t" + "cmp $0x3, %%rax \n\t" + "jne 2f \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vmaxps %%ymm0, %%ymm6, %%ymm6 \n\t" + "vmaxps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmaxps %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmaxps %%ymm0, %%ymm11, %%ymm11 \n\t" + ".align 16 \n\t" + "2: \n\t" + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), + [output0] "r"(useO0), [output1] "r"(useO1), [stepT] "r"(stepT), + [four] "x"(four), [eight] "x"(eight), [flag] "r"(flag), [bias] "r"(bias) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + } + bias += 8; + } +} + +void transformOutputWithPad4x4_3x3(F32 *input, + F32 *output, + F32 *tmp, + const F32 *bias, + U32 ow, + U32 oh, + U32 oc, + U32 wSize, + bool addF, + U32 pr, + U32 pb, + U32 h, + U32 w, + ActivationMode mode) +{ + __m256 two = _mm256_set1_ps(2.0f); + __m256 four = _mm256_set1_ps(4.0f); + __m256 eight = _mm256_set1_ps(8.0f); + U32 ocb = oc / 8; + for (U32 c = 0; c < ocb; ++c) { + for (U32 uw = 0; uw < wSize; ++uw) { + F32 *curI = input + uw * oc + c * 8; + F32 *curO = output + uw * 32 + c * 8 * oh * ow; + I64 stepI = 24 * oc * wSize; + I64 stepT = 192; + for (U32 i = 0; i < 6; ++i) { + F32 *useI0 = curI + i * oc * wSize; + F32 *useI1 = useI0 + 18 * oc * wSize; + F32 *useO0 = tmp + i * 8; + F32 *useO1 = useO0 + 96; + __asm__ __volatile__( + "vmovups (%[input0]), %%ymm0 \n\t" + "vmovups (%[input0], %[stepI]), %%ymm1 \n\t" + "vmovups (%[input0], %[stepI], 2), %%ymm2 \n\t" + "vmovups (%[input1]), %%ymm3 \n\t" + "vmovups (%[input1], %[stepI]), %%ymm4 \n\t" + "vmovups (%[input1], %[stepI], 2), %%ymm5 \n\t" + "vaddps %%ymm2, %%ymm1, %%ymm6 
\n\t" + "vaddps %%ymm3, %%ymm4, %%ymm7 \n\t" + "vsubps %%ymm2, %%ymm1, %%ymm8 \n\t" + "vsubps %%ymm4, %%ymm3, %%ymm9 \n\t" + "vaddps %%ymm6, %%ymm7, %%ymm1 \n\t" + "vaddps %%ymm9, %%ymm9, %%ymm3 \n\t" + "vaddps %%ymm0, %%ymm1, %%ymm11 \n\t" // xi0 + "vaddps %%ymm8, %%ymm3, %%ymm12 \n\t" // xi1 + "vmovups %%ymm11, (%[output0]) \n\t" + "vmovups %%ymm12, (%[output0], %[stepT]) \n\t" + "vfmadd231ps %[eight], %%ymm9, %%ymm8 \n\t" + "vfmadd231ps %[four], %%ymm7, %%ymm6 \n\t" + "vaddps %%ymm5, %%ymm8, %%ymm10 \n\t" // xi3 + "vmovups %%ymm6, (%[output1]) \n\t" + "vmovups %%ymm10, (%[output1], %[stepT]) \n\t" + : + : [input0] "r"(useI0), [input1] "r"(useI1), [stepI] "r"(stepI), [output0] "r"(useO0), + [output1] "r"(useO1), [stepT] "r"(stepT), [four] "x"(four), [eight] "x"(eight) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", + "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "memory", "cc"); + } + for (U32 i = 0; (i < 4) && (i + h < oh); ++i) { + __m256 xi[6]; + for (U32 j = 0; j < 6; ++j) { + xi[j] = _mm256_loadu_ps(tmp + (6 * i + j) * 8); + } + + __m256 t0 = _mm256_add_ps(xi[1], xi[2]); + __m256 t1 = _mm256_add_ps(xi[4], xi[3]); + __m256 t2 = _mm256_sub_ps(xi[1], xi[2]); + __m256 t3 = _mm256_sub_ps(xi[3], xi[4]); + + xi[0] = _mm256_add_ps(_mm256_add_ps(t0, t1), xi[0]); + xi[1] = _mm256_fmadd_ps(two, t3, t2); + xi[2] = _mm256_fmadd_ps(four, t1, t0); + xi[3] = _mm256_add_ps(_mm256_fmadd_ps(eight, t3, t2), xi[5]); + + if (addF) { + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_add_ps(xi[j], + _mm256_loadu_ps(output + (ow * i + uw * 4 + j) * 8 + c * 8 * oh * ow)); + } + } else { + __m256 b = _mm256_loadu_ps(bias + c * 8); + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_add_ps(xi[j], b); + } + } + + if (mode) { + __m256 zero = _mm256_setzero_ps(); + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + xi[j] = _mm256_max_ps(xi[j], zero); + } + } + + for (U32 j = 0; (j < 4) && (j + w + uw * 4 < ow); ++j) { + _mm256_storeu_ps(output + (ow * i + uw * 4 + j) * 8 + c * 8 * oh * ow, xi[j]); + } + } + } + } +} + +struct ConvController { + F32 **input; + const F32 *filter; + void *output; + F32 *eltwise; + I64 ic; + I64 fStep; + I64 flags; +}; + +typedef void (*kernelFunc)(ConvController &c); + +void winoKernel3x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x100(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + 
"prefetcht0 0x140(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x180(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x1C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x200(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x240(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x280(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x2C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x300(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x220(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x240(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x340(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x260(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x380(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x2A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x3C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x2E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x400(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x320(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x340(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x440(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x360(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x480(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x3A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x4C0(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm8 \n\t" + "vmovups 0x3E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm11 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 
0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + "vmovups %%ymm1, 0x80(%[output]) \n\t" + "vmovups %%ymm4, 0xA0(%[output]) \n\t" + "vmovups %%ymm7, 0xC0(%[output]) \n\t" + "vmovups %%ymm10, 0xE0(%[output]) \n\t" + "vmovups %%ymm2, 0x100(%[output]) \n\t" + "vmovups %%ymm5, 0x120(%[output]) \n\t" + "vmovups %%ymm8, 0x140(%[output]) \n\t" + "vmovups %%ymm11, 0x160(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm7, %%ymm7, %%ymm7 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorps %%ymm10, %%ymm10, %%ymm10 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "prefetcht0 0x100(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "prefetcht0 0x140(%[filter]) \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x180(%[filter]) \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x1C0(%[filter]) \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x200(%[filter]) \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x240(%[filter]) \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x280(%[filter]) \n\t" + "vmovups 0x1A0(%[filter]), 
%%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x2C0(%[filter]) \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x300(%[filter]) \n\t" + "vmovups 0x220(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x240(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x340(%[filter]) \n\t" + "vmovups 0x260(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x380(%[filter]) \n\t" + "vmovups 0x2A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x3C0(%[filter]) \n\t" + "vmovups 0x2E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x400(%[filter]) \n\t" + "vmovups 0x320(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x340(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x440(%[filter]) \n\t" + "vmovups 0x360(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "prefetcht0 0x480(%[filter]) \n\t" + "vmovups 0x3A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm7 \n\t" + "prefetcht0 0x4C0(%[filter]) \n\t" + "vmovups 0x3E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm9 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm10 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + 
"vmovups %%ymm1, 0x80(%[output]) \n\t" + "vmovups %%ymm4, 0xA0(%[output]) \n\t" + "vmovups %%ymm7, 0xC0(%[output]) \n\t" + "vmovups %%ymm10, 0xE0(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x32(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorps %%ymm9, %%ymm9, %%ymm9 \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vmovups 0x20(%[filter]), %%ymm10 \n\t" + "vmovups 0x40(%[filter]), %%ymm13 \n\t" + "vmovups 0x60(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vmovups 0xA0(%[filter]), %%ymm10 \n\t" + "vmovups 0xC0(%[filter]), %%ymm13 \n\t" + "vmovups 0xE0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vmovups 0x120(%[filter]), %%ymm10 \n\t" + "vmovups 0x140(%[filter]), %%ymm13 \n\t" + "vmovups 0x160(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x200(%[filter]), %%ymm15 \n\t" + "vmovups 0x220(%[filter]), %%ymm10 \n\t" + "vmovups 0x240(%[filter]), %%ymm13 \n\t" + "vmovups 0x260(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0x280(%[filter]), %%ymm15 \n\t" + "vmovups 0x2A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x2C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x2E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0x300(%[filter]), %%ymm15 \n\t" + "vmovups 0x320(%[filter]), %%ymm10 \n\t" + "vmovups 0x340(%[filter]), %%ymm13 \n\t" + "vmovups 0x360(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, 
%%ymm12, %%ymm9 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0x380(%[filter]), %%ymm15 \n\t" + "vmovups 0x3A0(%[filter]), %%ymm10 \n\t" + "vmovups 0x3C0(%[filter]), %%ymm13 \n\t" + "vmovups 0x3E0(%[filter]), %%ymm14 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm10, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm13, %%ymm12, %%ymm6 \n\t" + "vfmadd231ps %%ymm14, %%ymm12, %%ymm9 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x400, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm6, 0x40(%[output]) \n\t" + "vmovups %%ymm9, 0x60(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel3x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5, %%ymm5, %%ymm5 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, 
%%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm5 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm1, 0x40(%[output]) \n\t" + "vmovups %%ymm4, 0x60(%[output]) \n\t" + "vmovups %%ymm2, 0x80(%[output]) \n\t" + "vmovups %%ymm5, 0xA0(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorps %%ymm4, %%ymm4, %%ymm4 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), 
%%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm4 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + "vmovups %%ymm1, 0x40(%[output]) \n\t" + "vmovups %%ymm4, 0x60(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x16(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps 
%%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x100(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x120(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0x140(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x160(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0x180(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1A0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0x1C0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vmovups 0x1E0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm3 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x200, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm3, 0x20(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel3x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vbroadcastss (%[input2]), %%ymm14 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x4(%[input2]), %%ymm14 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x8(%[input2]), %%ymm14 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0xC(%[input2]), %%ymm14 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x10(%[input2]), %%ymm14 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 
0x14(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x14(%[input2]), %%ymm14 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x18(%[input2]), %%ymm14 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vbroadcastss 0x1C(%[input2]), %%ymm14 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + "vfmadd231ps %%ymm15, %%ymm14, %%ymm2 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x20, %[input1] \n\t" + "add $0x20, %[input2] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm1, 0x20(%[output]) \n\t" + "vmovups %%ymm2, 0x40(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel2x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vbroadcastss (%[input1]), %%ymm13 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x4(%[input1]), %%ymm13 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x8(%[input1]), %%ymm13 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0xC(%[input1]), %%ymm13 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x10(%[input1]), %%ymm13 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x14(%[input1]), %%ymm13 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x18(%[input1]), %%ymm13 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vbroadcastss 0x1C(%[input1]), %%ymm13 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + "vfmadd231ps %%ymm15, %%ymm13, %%ymm1 \n\t" + + "add $0x20, %[input0] \n\t" + 
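/* One unrolled pass over 8 input channels is complete: the first input tile
   pointer has just advanced by 8 floats (0x20 bytes); the second tile and the
   packed filter (8 channels x 8 output channels = 0x100 bytes) advance next,
   and rcx, preloaded from c.ic (= icSize / 8), counts the remaining channel
   groups for the backward branch to label 1. */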
"add $0x20, %[input1] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + "vmovups %%ymm1, 0x20(%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +void winoKernel1x8(ConvController &c) +{ + __asm__ __volatile__( + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[input0]), %%ymm12 \n\t" + "vmovups 0x0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x4(%[input0]), %%ymm12 \n\t" + "vmovups 0x20(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x8(%[input0]), %%ymm12 \n\t" + "vmovups 0x40(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0xC(%[input0]), %%ymm12 \n\t" + "vmovups 0x60(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x10(%[input0]), %%ymm12 \n\t" + "vmovups 0x80(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x14(%[input0]), %%ymm12 \n\t" + "vmovups 0xA0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x18(%[input0]), %%ymm12 \n\t" + "vmovups 0xC0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "vbroadcastss 0x1C(%[input0]), %%ymm12 \n\t" + "vmovups 0xE0(%[filter]), %%ymm15 \n\t" + "vfmadd231ps %%ymm15, %%ymm12, %%ymm0 \n\t" + + "add $0x20, %[input0] \n\t" + "add $0x100, %[filter] \n\t" + "dec %%rcx \n\t" + "jg 1b \n\t" + + "vmovups %%ymm0, (%[output]) \n\t" + : + : [input0] "r"(c.input[0]), [input1] "r"(c.input[1]), [input2] "r"(c.input[2]), + [filter] "r"(c.filter), [output] "r"(c.output), [ic] "c"(c.ic) + : "%rax", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", + "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory", "cc"); +} + +EE convolution_winograd(TensorDesc inputDesc, + F32 *inArray, + F32 *eltwiseInput, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ic, ih, iw; + U32 fn, fc, fh, fw; + U32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((fdf != DF_NCHWCxN32 && fdf != DF_NCHWCxN24) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + if (activationDesc.mode != ACTIVATION_RELU && activationDesc.mode != ACTIVATION_NULL) { + CHECK_STATUS(NOT_SUPPORTED); + } + + // get kernels + const kernelFunc wino[3][3] = { + {winoKernel1x8, winoKernel2x8, winoKernel3x8}, + {winoKernel1x16, winoKernel2x16, winoKernel3x16}, + {winoKernel1x32, winoKernel2x32, winoKernel3x32}, + }; + + // get computing params + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = 
convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + I32 ih_pad = ih + paddingT + paddingB; + I32 iw_pad = iw + paddingL + paddingR; + I32 ohow = oh * ow; + + I32 oPaddingR = (ow % 4 == 0) ? 0 : (4 - ow % 4); + I32 oPaddingB = (oh % 4 == 0) ? 0 : (4 - oh % 4); + I32 oh_pad = oh + oPaddingB; + I32 ow_pad = ow + oPaddingR; + paddingR += oPaddingR; + paddingB += oPaddingB; + + // infer block params + I32 ocBlockSizes[] = {8, 16, 32}; + I32 wSizes[] = {1, 2, 3}; + + // infer kernel params + ConvController convCtl; + convCtl.eltwise = nullptr; + F32 *iaddr[3]; + convCtl.input = iaddr; + bool noPadI = (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0); + bool noPadO = (oPaddingB == 0 && oPaddingR == 0); + for (U32 n = 0; n < in; ++n) { + F32 *bInArray = inArray + n * ic * ih * iw; + F32 *bOutArray = outArray + n * oc * oh * ow; + + I32 icSize = 0; + bool addF = false; + ActivationMode mode = ACTIVATION_NULL; + for (I32 icb = 0; icb < (int)ic; icb += icSize) { + icSize = UNI_MIN(BLOCK_IC_DIM, (int)ic - icb); + addF = (icb > 0); + if (icb == (int)ic - icSize) { + mode = activationDesc.mode; + } + + for (I32 h = 0; h < oh_pad; h += 4) { + I32 ocSize = 0; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(32, (int)oc - ocb); + ocSize = ocBlockSizes[ocSize >> 4]; + const F32 *bias = biasArray + ocb; + I32 wSize = 0; + for (I32 w = 0; w < ow_pad; w += 4 * wSize) { + wSize = UNI_MIN((int)ow_pad - w, 12); + wSize = wSize >> 2; + I32 in_w = w * strideW; + I32 in_h = h * strideH; + F32 *curI; + F32 *curO = bOutArray + ocb * oh * ow + (h * ow + w) * 8; + F32 *tmpI = (F32 *)tmp + 36 * icSize * w / 4; + F32 *buff = (F32 *)tmp + 36 * icSize * (ow_pad / 4 + 1); + F32 *tmpO = (F32 *)buff + 36 * 36 * wSize; + if (ocb == 0) { + if (noPadI) { + curI = bInArray + icb * ih * iw + (in_h * iw + in_w) * 8; + transformInput4x4_3x3( + curI, tmpI, buff, iw, ih, icSize, wSize, icSize); + } else { + in_w = (in_w > paddingL) ? (in_w - paddingL) : 0; + in_h = (in_h > paddingT) ? 
(in_h - paddingT) : 0; + curI = bInArray + icb * ih * iw + (in_h * iw + in_w) * 8; + transformInputWithPad4x4_3x3(curI, tmpI, buff, iw, ih, icSize, + wSize, icSize, paddingL, paddingR, paddingT, paddingB, h, w, + oh_pad, ow_pad); + } + } + + for (I32 i = 0; i < 36; ++i) { + convCtl.ic = icSize / 8; + convCtl.input[0] = tmpI + i * icSize; + convCtl.input[1] = tmpI + icSize * 36 * 1 + i * icSize; + convCtl.input[2] = tmpI + icSize * 36 * 2 + i * icSize; + convCtl.output = tmpO + i * ocSize * wSize; + convCtl.filter = filterArray + icb * fn * 36 + ocb * icSize * 36 + + i * ocSize * icSize; + wino[ocSize >> 4][wSize - 1](convCtl); + } + if (noPadO) { + transformOutput4x4_3x3( + tmpO, curO, buff, bias, ow, oh, ocSize, wSize, addF, mode); + } else { + transformOutputWithPad4x4_3x3(tmpO, curO, buff, bias, ow, oh, ocSize, + wSize, addF, oPaddingR, oPaddingB, h, w, mode); + } + } + } + } + } + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp index f6e4fd28..eab965ff 100644 --- a/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/deconvolution_transform.cpp @@ -17,7 +17,7 @@ template inline void transformCNHW2NCHWCxNxKernel( U32 fc, U32 fn, U32 fh, U32 fw, U32 fnPadding, const F32 *input, F32 *output) { - F32 *dest; + F32 *dest = nullptr; const F32 *src; U32 cSize = 0, cSizePadding = 0; U32 lstep = fh * fw; @@ -44,7 +44,7 @@ inline void transformCNHW2NCHWCxNxKernel( _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); } } - memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + UNI_MEMSET(dest + N, 0, ((cSizePadding - cSize) * N * 4)); } } } @@ -85,7 +85,7 @@ inline EE transformCNHW2NCHWCxNx( tail -= 8; } if (tail > 0) { - F32 *dest; + F32 *dest = nullptr; const F32 *src; U32 cSize = 0, cSizePadding = 0; U32 hwMax = fh * fw - 1; @@ -108,7 +108,7 @@ inline EE transformCNHW2NCHWCxNx( dest = output + n * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); } - memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + UNI_MEMSET(dest + 8, 0, ((cSizePadding - cSize) * 32)); } } } @@ -169,7 +169,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -180,7 +180,7 @@ inline EE deconvolution_transform_filter_kernel_fp32(TensorDesc filterDesc, case DF_NCHWC24: { filterDesc = tensor4df(fdt, fdf, 1, fc, fh, fw); *ftmDesc = tensor4df(fdt, ftmDataFormat, 1, fc, fh, fw); - transformCNHW2NCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + transformCNHW2NCHWCxNx<1, 16>(filterDesc, filterArray, *ftmDesc, ftmArray); *ftmDesc = tensor4df(fdt, ftmDataFormat, fn, fc, fh, fw); break; } diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp index 8d841efd..d0e54d09 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_direct.cpp @@ -20,7 +20,7 @@ #include "cpu/x86/fp32/convolution_functions.h" #define UNROLL_W 4 -#define UNROLL_OC_BLOCK_DIM 24 +#define 
UNROLL_OC_BLOCK_DIM 16 typedef void (*kernelFunc)(F32 *in0, F32 *in1, @@ -334,6 +334,117 @@ void Avx2DwKernel4x16(F32 *in0, "%ymm15", "memory", "cc"); } +void Avx512DwKernel4x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 fw, + I32 fh, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 dw, + I32 wStep) +{ + __asm__ __volatile__("vmovups (%5), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %6 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %6, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%4), %%zmm11 \n\t" + "vmovups (%0), %%zmm12 \n\t" + "vmovups (%1), %%zmm13 \n\t" + "vmovups (%2), %%zmm14 \n\t" + "vmovups (%3), %%zmm15 \n\t" + "vfmadd231ps %%zmm12, %%zmm11, %%zmm0 \n\t" + "vfmadd231ps %%zmm13, %%zmm11, %%zmm1 \n\t" + "prefetcht0 0x40(%4) \n\t" + "vfmadd231ps %%zmm14, %%zmm11, %%zmm2 \n\t" + "vfmadd231ps %%zmm15, %%zmm11, %%zmm3 \n\t" + + "add %12, %0 \n\t" + "add %12, %1 \n\t" + "add %12, %2 \n\t" + "add %12, %3 \n\t" + "add $0x40, %4 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %10, %4 \n\t" + "add %9, %0 \n\t" + "add %9, %1 \n\t" + "add %9, %2 \n\t" + "add %9, %3 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %11, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" + "vmaxps %%zmm15, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm15, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm15, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm15, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm15, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm15, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm15, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm15, %%zmm7, %%zmm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%zmm12, %%zmm15, %%zmm12 \n\t" + "vminps %%zmm12, %%zmm0, %%zmm0 \n\t" + "vminps %%zmm12, %%zmm1, %%zmm1 \n\t" + "vminps %%zmm12, %%zmm2, %%zmm2 \n\t" + "vminps %%zmm12, %%zmm3, %%zmm3 \n\t" + "vminps %%zmm12, %%zmm4, %%zmm4 \n\t" + "vminps %%zmm12, %%zmm5, %%zmm5 \n\t" + "vminps %%zmm12, %%zmm6, %%zmm6 \n\t" + "vminps %%zmm12, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(curW), "r"(curB), "r"(fw), + "c"(fh), "r"((I64)iStep), "r"((I64)hStep), "r"((I64)wStep), "r"(flags), + "r"((I64)dw) + : "%eax", "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", + "%zmm14", "%zmm15", "memory", "cc"); + + __asm__ __volatile__("vmovups %%zmm0, (%0) \n\t" + "vmovups %%zmm1, 0x40(%0) \n\t" + "vmovups %%zmm2, 0x80(%0) \n\t" + "vmovups %%zmm3, 0xC0(%0) \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "memory", "cc"); +} + void Avx2DwKernel4x8(F32 *in0, F32 *in1, F32 *in2, @@ -593,6 +704,75 @@ void Avx2DwKernel1x16(F32 *in0, "%ymm14", "%ymm15", "memory", "cc"); } +void Avx512DwKernel1x16(F32 *in0, + F32 *in1, + F32 *in2, + F32 *in3, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 fw, + I32 fh, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 dw, + I32 wStep) +{ + __asm__ __volatile__("vmovups (%3), %%zmm0 \n\t" + + "cmp $0, %%ecx \n\t" + "je 3f \n\t" + "cmp $0, %4 \n\t" + "je 3f \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %4, %%eax \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovaps (%1), %%zmm1 \n\t" + "vmovups (%0), 
%%zmm2 \n\t" + "vfmadd231ps %%zmm2, %%zmm1, %%zmm0 \n\t" + + "add %11, %0 \n\t" + "add $0x40, %1 \n\t" + "dec %%eax \n\t" + "jg 1b \n\t" + + "add %6, %1 \n\t" + "add %9, %0 \n\t" + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %10, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 3f \n\t" + "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm3, %%zmm0, %%zmm0 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 3f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm3 \n\t" + "vbroadcastss %%xmm3, %%zmm4 \n\t" + "vminps %%zmm4, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "vmovups %%zmm0, (%2) \n\t" + : + : "r"(in0), "r"(curW), "r"(curO), "r"(curB), "r"(fw), "c"(fh), + "r"((I64)wStep), "r"((I64)oStep), "r"((I64)iStep), "r"((I64)hStep), + "r"(flags), "r"((I64)dw) + : "%eax", "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "memory", "cc"); +} + void Avx2DwKernel1x8(F32 *in0, F32 *in1, F32 *in2, @@ -836,6 +1016,103 @@ inline void Avx2DwKernel33s14x24(F32 *in0, "%ymm15", "memory", "cc"); } +inline void Avx512DwKernel33s14x16(F32 *in0, + const F32 *curW, + F32 *curO, + const F32 *curB, + I32 oStep, + I32 iStep, + I32 hStep, + I32 flags, + I32 fh) +{ + __asm__ __volatile__( + "vmovups (%2), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + + ".align 16 \n\t" + "0: " + + "vmovaps (%1), %%zmm15 \n\t" + "vmovups (%0), %%zmm8 \n\t" + "vmovups 0x40(%0), %%zmm9 \n\t" + "vmovups 0x80(%0), %%zmm10 \n\t" + "vmovups 0xC0(%0), %%zmm11 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm9, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm3 \n\t" + + "vmovaps 0x40(%1), %%zmm15 \n\t" + "vmovups 0x100(%0), %%zmm8 \n\t" + "vfmadd231ps %%zmm9, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm3 \n\t" + + "vmovaps 0x80(%1), %%zmm15 \n\t" + "vmovups 0x140(%0), %%zmm12 \n\t" + "vfmadd231ps %%zmm10, %%zmm15, %%zmm0 \n\t" + "vfmadd231ps %%zmm11, %%zmm15, %%zmm1 \n\t" + "vfmadd231ps %%zmm8, %%zmm15, %%zmm2 \n\t" + "vfmadd231ps %%zmm12, %%zmm15, %%zmm3 \n\t" + + "add %4, %0 \n\t" + "add $0xC0, %1 \n\t" + + "dec %%ecx \n\t" + "jg 0b \n\t" + + // relu + "mov %5, %%eax \n\t" + "and $0x6, %%eax \n\t" + "je 1f \n\t" + "vxorps %%zmm15, %%zmm15, %%zmm15 \n\t" + "vmaxps %%zmm15, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm15, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm15, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm15, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm15, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm15, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm15, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm15, %%zmm7, %%zmm7 \n\t" + + // relu6 + "and $0x4, %%eax \n\t" + "je 1f \n\t" + "mov $0x40C00000, %%eax \n\t" + "vmovd %%eax, %%xmm12 \n\t" + "vpermps %%zmm12, %%zmm15, %%zmm12 \n\t" + "vminps %%zmm12, %%zmm0, %%zmm0 \n\t" + "vminps %%zmm12, %%zmm1, %%zmm1 \n\t" + "vminps %%zmm12, %%zmm2, %%zmm2 \n\t" + "vminps %%zmm12, %%zmm3, %%zmm3 \n\t" + "vminps %%zmm12, %%zmm4, %%zmm4 \n\t" + "vminps %%zmm12, %%zmm5, %%zmm5 \n\t" + "vminps %%zmm12, %%zmm6, %%zmm6 \n\t" + "vminps %%zmm12, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "1: \n\t" + : + : "r"(in0), "r"(curW), "r"(curB), "r"((I64)iStep), "r"((I64)hStep), "r"(flags), "c"(fh) + : "%eax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", + "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", "%zmm15", "memory", + "cc"); + + __asm__ __volatile__("vmovups 
%%zmm0, (%0) \n\t" + "vmovups %%zmm1, 0x40(%0) \n\t" + "vmovups %%zmm2, 0x80(%0) \n\t" + "vmovups %%zmm3, 0xC0(%0) \n\t" + : + : "r"(curO), "r"((I64)oStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", + "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "memory", "cc"); +} + inline void Avx2DwKernel33s14x16(F32 *in0, const F32 *curW, F32 *curO, @@ -1242,25 +1519,28 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, CHECK_STATUS(tensor4dGetI32(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGetI32(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - if ((fdf != DF_NCHWC24 && fdf != DF_NCHWC8) || (idf != DF_NCHWC8) || (ic % 8 != 0)) { + if ((fdf != DF_NCHWC24 && fdf != DF_NCHWC8) || (idf != DF_NCHWC8 && idf != DF_NCHWC16) || (ic % 8 != 0)) { CHECK_STATUS(NOT_MATCH); } // get kernels - kernelFunc kernel[2][3] = {{Avx2DwKernel1x8, Avx2DwKernel1x16, Avx2DwKernel1x24}, - {Avx2DwKernel4x8, Avx2DwKernel4x16, Avx2DwKernel4x24}}; + kernelFunc kernel[3][2] = {{Avx2DwKernel1x8, Avx2DwKernel4x8}, + {Avx2DwKernel1x16, Avx2DwKernel4x16}, + {Avx2DwKernel1x24, Avx2DwKernel4x24}}; kernel33Func kernel33[2][3] = {{Avx2DwKernel33s18x8, Avx2DwKernel33s14x16, Avx2DwKernel33s14x24}, {Avx2DwKernel33s28x8, nullptr, nullptr}}; + kernelFunc kernel512[2] = {Avx512DwKernel1x16, Avx512DwKernel4x16}; + kernel33Func kernel51233[1] = {Avx512DwKernel33s14x16}; I32 unrollOcArray[3] = {8, 16, 24}; I32 unrollHw33s1Array[3] = {8, 4, 4}; // get computing params I32 strideH = convParamSpec.stride_h; I32 strideW = convParamSpec.stride_w; - I32 paddingT = convParamSpec.padding_top; - I32 paddingB = convParamSpec.padding_bottom; - I32 paddingL = convParamSpec.padding_left; - I32 paddingR = convParamSpec.padding_right; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; I32 dilateH = convParamSpec.dilatedRate_h; I32 dilateW = convParamSpec.dilatedRate_w; I32 fhDilated = (fh - 1) * dilateH + 1; @@ -1270,14 +1550,15 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, // infer block params I32 unrollOc = UNROLL_OC_BLOCK_DIM; I32 unrollHw = UNROLL_W; + I32 cLen = (idf == DF_NCHWC16)? 
16: 8; // infer kernel params - I32 oStep = oh * ow * SIMDW * BYTES; - I32 iStep = ih * iw * SIMDW * BYTES; - I32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; - I32 hStep33 = iw * SIMDW * BYTES; - I32 sw = strideW * SIMDW * BYTES; - I32 dw = dilateW * SIMDW * BYTES; + I32 oStep = oh * ow * cLen * BYTES; + I32 iStep = ih * iw * cLen * BYTES; + I32 hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; + I32 hStep33 = iw * cLen * BYTES; + I32 sw = strideW * cLen * BYTES; + I32 dw = dilateW * cLen * BYTES; // fuse dw+pw F32 *useOutArray = (F32 *)tmp; @@ -1300,6 +1581,13 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, ocSize = UNI_MIN(unrollOc, ic - ocb); I32 ocIdx = (ocSize >> 3) - 1; ocSize = unrollOcArray[ocIdx]; + kernelFunc *wkernel = kernel[ocIdx]; + kernel33Func wkernel33 = kernel33[0][ocIdx]; + if (idf == DF_NCHWC16) { + ocSize = 16; + wkernel = kernel512; + wkernel33 = kernel51233[0]; + } if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { I32 wSize = 0; if (use3x3) { @@ -1309,15 +1597,15 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, wSize = UNI_MIN(ow - w, unrollHw); I32 in_h_0 = h * strideH; I32 in_w_0 = w * strideW; - F32 *in_0 = curI + in_h_0 * iw * SIMDW + in_w_0 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + in_h_0 * iw * cLen + in_w_0 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; if (wSize < unrollHw) { - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, curW, calO, curB, + wkernel[0](in_0, nullptr, nullptr, nullptr, curW, calO, curB, fw, fh, oStep, iStep, hStep, flags, dw, 0); wSize = 1; } else { - kernel33[strideW - 1][ocIdx]( + wkernel33( in_0, curW, calO, curB, oStep, iStep, hStep33, flags, 3); } } @@ -1336,12 +1624,12 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, I32 in_w_2 = (hw + 2) % ow * strideW; I32 in_h_3 = (hw + 3) / ow * strideH; I32 in_w_3 = (hw + 3) % ow * strideW; - F32 *in_0 = curI + in_h_0 * iw * SIMDW + in_w_0 * SIMDW; - F32 *in_1 = curI + in_h_1 * iw * SIMDW + in_w_1 * SIMDW; - F32 *in_2 = curI + in_h_2 * iw * SIMDW + in_w_2 * SIMDW; - F32 *in_3 = curI + in_h_3 * iw * SIMDW + in_w_3 * SIMDW; + F32 *in_0 = curI + in_h_0 * iw * cLen + in_w_0 * cLen; + F32 *in_1 = curI + in_h_1 * iw * cLen + in_w_1 * cLen; + F32 *in_2 = curI + in_h_2 * iw * cLen + in_w_2 * cLen; + F32 *in_3 = curI + in_h_3 * iw * cLen + in_w_3 * cLen; - kernel[wSize >> 2][ocIdx](in_0, in_1, in_2, in_3, curW, curO + hw * SIMDW, + wkernel[wSize >> 2](in_0, in_1, in_2, in_3, curW, curO + hw * cLen, curB, fw, fh, oStep, iStep, hStep, flags, dw, 0); } } @@ -1369,28 +1657,28 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, inW = (inW >= 0) ? 
inW : iwJump; tfw = GetKernelnoDilated(tfw, dilateW); const F32 *useW = calW + wwJump * ocSize; - F32 *in_0 = curI + inH * iw * SIMDW + inW * SIMDW; - F32 *calO = curO + (h * ow + realW) * SIMDW; - hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, + F32 *in_0 = curI + inH * iw * cLen + inW * cLen; + F32 *calO = curO + (h * ow + realW) * cLen; + hStep = (iw - tfw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; + wkernel[0](in_0, nullptr, nullptr, nullptr, useW, calO, curB, tfw, tfh, oStep, iStep, hStep, flags, dw, (fw - tfw) * ocSize * BYTES); } w = owPaddingL; I32 wSize = 0; - hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * SIMDW * BYTES; + hStep = (iw - fw * dilateW + (dilateH - 1) * iw) * cLen * BYTES; if (use3x3) { unrollHw = unrollHw33s1Array[ocIdx]; for (; w < ow - owPaddingR; w += wSize) { wSize = UNI_MIN(ow - owPaddingR - w, unrollHw); I32 in_w_0 = w * strideW - paddingL; - F32 *in_0 = curI + inH * iw * SIMDW + in_w_0 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + inH * iw * cLen + in_w_0 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; if (wSize < unrollHw) { - kernel[0][ocIdx](in_0, nullptr, nullptr, nullptr, calW, calO, curB, + wkernel[0](in_0, nullptr, nullptr, nullptr, calW, calO, curB, fw, tfh, oStep, iStep, hStep, flags, dw, 0); wSize = 1; } else { - kernel33[strideW - 1][ocIdx]( + wkernel33( in_0, calW, calO, curB, oStep, iStep, hStep33, flags, tfh); } } @@ -1404,13 +1692,13 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, I32 in_w_1 = (w + 1) * strideW - paddingL; I32 in_w_2 = (w + 2) * strideW - paddingL; I32 in_w_3 = (w + 3) * strideW - paddingL; - F32 *in_0 = curI + inH * iw * SIMDW + in_w_0 * SIMDW; - F32 *in_1 = curI + inH * iw * SIMDW + in_w_1 * SIMDW; - F32 *in_2 = curI + inH * iw * SIMDW + in_w_2 * SIMDW; - F32 *in_3 = curI + inH * iw * SIMDW + in_w_3 * SIMDW; - F32 *calO = curO + (h * ow + w) * SIMDW; + F32 *in_0 = curI + inH * iw * cLen + in_w_0 * cLen; + F32 *in_1 = curI + inH * iw * cLen + in_w_1 * cLen; + F32 *in_2 = curI + inH * iw * cLen + in_w_2 * cLen; + F32 *in_3 = curI + inH * iw * cLen + in_w_3 * cLen; + F32 *calO = curO + (h * ow + w) * cLen; - kernel[wSize >> 2][ocIdx](in_0, in_1, in_2, in_3, calW, calO, curB, fw, + wkernel[wSize >> 2](in_0, in_1, in_2, in_3, calW, calO, curB, fw, tfh, oStep, iStep, hStep, flags, dw, 0); } } @@ -1424,7 +1712,7 @@ EE depthwise_convolution_direct(TensorDesc inputDesc, tmpBytes -= oh * ic * oh * ow + 32; tmp = (void *)((F32 *)tmp + oh * ic * oh * ow + 32); ConvolutionParamSpec p = createConvolutionParamSpec( - 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, fn, Convolution_Pointwise); + 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, fn, CONVOLUTION_POINTWISE); convolution_1x1_direct(pwInputDesc, useOutArray, eltwiseInput, pwFilterDesc, pwFilterArray, p, pwBiasArray, tmpBytes, tmp, outputDesc, outArray, pointwiseActivationParamSpec); } diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp index fd6b15ca..ae32441c 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_convolution_transform.cpp @@ -29,7 +29,7 @@ inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDe CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - 
memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { @@ -39,7 +39,7 @@ inline EE depthwise_convolution_transform_filter_kernel_fp32(TensorDesc filterDe *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); switch (ftmDataFormat) { case DF_NCHWC24: { - transformNCHWToNCHWCxNx<1, 24>(filterDesc, filterArray, *ftmDesc, ftmArray); + transformNCHWToNCHWCxNx<1, 16>(filterDesc, filterArray, *ftmDesc, ftmArray); break; } case DF_NCHWC8: { diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp index b431d50c..5118606f 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution.cpp @@ -51,7 +51,7 @@ EE depthwise_pointwise_convolution_fp32(TensorDesc inputDesc, if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); } - if (!(idf == DF_NCHWC8 && odf == DF_NCHWC8)) { + if (!(idf == DF_NCHWC8 || idf == DF_NCHWC16)) { CHECK_STATUS(NOT_MATCH); } if (ic != fc) { diff --git a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp index d6ffa6ca..2aae9f3f 100644 --- a/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/fp32/depthwise_pointwise_convolution_transform.cpp @@ -31,7 +31,7 @@ EE depthwise_pointwise_convolution_transform_filter_fp32(TensorDesc dwFilterDesc } ConvolutionParamSpec p = createConvolutionParamSpec(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, - 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], Convolution_Pointwise); + 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], CONVOLUTION_POINTWISE); ret = convolution_transform_filter_fp32( pwFilterDesc, pwFilter, p, CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); CHECK_STATUS(ret); diff --git a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp index 4094c251..9f89a0fa 100644 --- a/compute/tensor/src/cpu/x86/fp32/eltwise.cpp +++ b/compute/tensor/src/cpu/x86/fp32/eltwise.cpp @@ -104,17 +104,24 @@ EE eltwise_fp32(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - if ((num == 2) && (inputSize[0] == (I32)len) && (inputSize[0] == inputSize[1])) { + if ((num == 2) && (inputSize[0] != 1) && (inputSize[1] != 1)) { F32 *in0 = (F32 *)input[0]; F32 *in1 = (F32 *)input[1]; F32 *out = (F32 *)output; + len = UNI_MIN(inputSize[0], inputSize[1]); #ifdef _USE_OPENMP - U32 BLOCK = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 ompBlock = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 BLOCK = UNI_MAX(64, ompBlock); U32 blockNum = (len + BLOCK - 1) / BLOCK; -#pragma omp parallel num_threads(OMP_NUM_THREADS) - { + int in_parallel = omp_in_parallel(); + if (in_parallel != 0) { + BLOCK = len; + blockNum = 1; + } +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) #endif + { switch (eltwiseMode) { case ELTWISE_SUM: { #ifdef _USE_OPENMP @@ -125,7 +132,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vaddps, vaddss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vaddps, vaddss, len, in0, in1, out); + eltwise_kernel(vaddps, vaddss, len, in0, in1, out); #endif break; } @@ -138,7 +145,7 @@ EE eltwise_fp32(std::vector input, 
eltwise_kernel(vmaxps, vmaxss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vmaxps, vmaxss, len, in0, in1, out); + eltwise_kernel(vmaxps, vmaxss, len, in0, in1, out); #endif break; } @@ -152,7 +159,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vmulps, vmulss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vmulps, vmulss, len, in0, in1, out); + eltwise_kernel(vmulps, vmulss, len, in0, in1, out); #endif break; } @@ -165,7 +172,7 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vsubps, vsubss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vsubps, vsubss, len, in0, in1, out); + eltwise_kernel(vsubps, vsubss, len, in0, in1, out); #endif break; } @@ -178,80 +185,93 @@ EE eltwise_fp32(std::vector input, eltwise_kernel(vdivps, vdivss, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vdivps, vdivss, len, in0, in1, out); + eltwise_kernel(vdivps, vdivss, len, in0, in1, out); #endif break; } default: ret = NOT_SUPPORTED; + break; } -#ifdef _USE_OPENMP } -#endif return ret; } - F32 buffer[8]; - F32 *tmp = buffer; U32 len_tail = len % 8; U32 len_main = len - len_tail; - F32 *output_ptr = (F32 *)output; - for (U32 i = 0; i < len_main; i += 8) { - get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); - __m256 tmp_v = _mm256_loadu_ps(tmp); - for (U32 j = 1; j < num; j++) { - get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); - __m256 value_v = _mm256_loadu_ps(tmp); - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_v = _mm256_add_ps(value_v, tmp_v); - break; - case ELTWISE_MAX: - tmp_v = _mm256_max_ps(value_v, tmp_v); - break; - case ELTWISE_PROD: - tmp_v = _mm256_mul_ps(value_v, tmp_v); - break; - case ELTWISE_SUB: - tmp_v = _mm256_sub_ps(tmp_v, value_v); - break; - case ELTWISE_DIV: - tmp_v = _mm256_div_ps(tmp_v, value_v); - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + F32 buffer[8]; + F32 *tmp = buffer; + F32 *output_ptr = (F32 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 8) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256 tmp_v = _mm256_loadu_ps(tmp); + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256 value_v = _mm256_loadu_ps(tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_ps(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_ps(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mul_ps(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_ps(tmp_v, value_v); + break; + case ELTWISE_DIV: + tmp_v = _mm256_div_ps(tmp_v, value_v); + break; + default: + ret = NOT_SUPPORTED; + break; + } } + _mm256_storeu_ps(output_ptr + i, tmp_v); } - _mm256_storeu_ps(output_ptr + i, tmp_v); - } - for (U32 i = len_main; i < len; i++) { - get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); - F32 tmp_s = tmp[0]; - for (U32 j = 1; j < num; j++) { - get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); - F32 value_s = tmp[0]; - switch (eltwiseMode) { - case ELTWISE_SUM: - tmp_s = value_s + tmp_s; - break; - case ELTWISE_MAX: - tmp_s = (value_s > tmp_s) ? 
value_s : tmp_s; - break; - case ELTWISE_PROD: - tmp_s *= value_s; - break; - case ELTWISE_SUB: - tmp_s = tmp_s - value_s; - break; - case ELTWISE_DIV: - tmp_s = tmp_s / value_s; - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((F32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + F32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((F32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + F32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s = tmp_s - value_s; + break; + case ELTWISE_DIV: + tmp_s = tmp_s / value_s; + break; + default: + ret = NOT_SUPPORTED; + break; + } } + output_ptr[i] = tmp_s; } - output_ptr[i] = tmp_s; } return ret; } @@ -264,17 +284,23 @@ EE eltwise_i32(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - if ((num == 2) && (inputSize[0] == (I32)len) && (inputSize[0] == inputSize[1])) { + if ((num == 2) && (inputSize[0] != 1) && (inputSize[1] != 1)) { I32 *in0 = (I32 *)input[0]; I32 *in1 = (I32 *)input[1]; I32 *out = (I32 *)output; #ifdef _USE_OPENMP - U32 BLOCK = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 ompBlock = ((len + OMP_NUM_THREADS - 1) / OMP_NUM_THREADS + 7) / 8 * 8; + U32 BLOCK = UNI_MAX(64, ompBlock); U32 blockNum = (len + BLOCK - 1) / BLOCK; -#pragma omp parallel num_threads(OMP_NUM_THREADS) - { + int in_parallel = omp_in_parallel(); + if (in_parallel != 0) { + BLOCK = len; + blockNum = 1; + } +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) #endif + { switch (eltwiseMode) { case ELTWISE_SUM: { #ifdef _USE_OPENMP @@ -285,7 +311,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpaddd, vpaddd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpaddd, vpaddd, len, in0, in1, out); + eltwise_kernel(vpaddd, vpaddd, len, in0, in1, out); #endif break; } @@ -298,7 +324,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpmaxsd, vpmaxsd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpmaxsd, vpmaxsd, len, in0, in1, out); + eltwise_kernel(vpmaxsd, vpmaxsd, len, in0, in1, out); #endif break; } @@ -311,7 +337,7 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpmulld, vpmulld, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpmulld, vpmulld, len, in0, in1, out); + eltwise_kernel(vpmulld, vpmulld, len, in0, in1, out); #endif break; } @@ -324,20 +350,90 @@ EE eltwise_i32(std::vector input, eltwise_kernel(vpsubd, vpsubd, blockSize, in0 + off, in1 + off, out + off); } #else - eltwise_kernel(vpsubd, vpsubd, len, in0, in1, out); + eltwise_kernel(vpsubd, vpsubd, len, in0, in1, out); #endif break; } default: ret = NOT_SUPPORTED; + break; } -#ifdef _USE_OPENMP } -#endif return ret; } - return NOT_SUPPORTED; + U32 len_tail = len % 8; + U32 len_main = len - len_tail; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + I32 buffer[8]; + I32 *tmp = buffer; + I32 *output_ptr = (I32 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 8) { + get_vector((I32 *)input[0], inputSize[0], &tmp, 8, i, 8, buffer); + __m256i tmp_v = _mm256_loadu_si256((const __m256i *)tmp); + for (U32 j = 1; j < num; j++) { + 
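// get_vector() supplies 8 lanes of operand j starting at element i, using the
// on-stack buffer when input[j] cannot provide 8 contiguous elements directly
// (for example a broadcast scalar, i.e. inputSize[j] == 1).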
get_vector((I32 *)input[j], inputSize[j], &tmp, 8, i, 8, buffer); + __m256i value_v = _mm256_loadu_si256((const __m256i *)tmp); + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_v = _mm256_add_epi32(value_v, tmp_v); + break; + case ELTWISE_MAX: + tmp_v = _mm256_max_epi32(value_v, tmp_v); + break; + case ELTWISE_PROD: + tmp_v = _mm256_mullo_epi32(value_v, tmp_v); + break; + case ELTWISE_SUB: + tmp_v = _mm256_sub_epi32(tmp_v, value_v); + break; + default: + ret = NOT_SUPPORTED; + } + } + _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); + } + +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((I32 *)input[0], inputSize[0], &tmp, 8, i, 1, buffer); + I32 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((I32 *)input[j], inputSize[j], &tmp, 8, i, 1, buffer); + I32 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_SUM: + tmp_s = value_s + tmp_s; + break; + case ELTWISE_MAX: + tmp_s = (value_s > tmp_s) ? value_s : tmp_s; + break; + case ELTWISE_PROD: + tmp_s *= value_s; + break; + case ELTWISE_SUB: + tmp_s = tmp_s - value_s; + break; + case ELTWISE_DIV: + tmp_s = tmp_s / value_s; + break; + default: + ret = NOT_SUPPORTED; + } + } + output_ptr[i] = tmp_s; + } + } + return ret; } EE eltwise_u8(std::vector input, @@ -348,55 +444,69 @@ EE eltwise_u8(std::vector input, EltwiseMode eltwiseMode) { EE ret = SUCCESS; - U8 buffer[32]; - U8 *tmp = buffer; U32 len_tail = len % 32; U32 len_main = len - len_tail; - U8 *output_ptr = (U8 *)output; - for (U32 i = 0; i < len_main; i += 32) { - get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 32, buffer); - __m256i tmp_v = _mm256_loadu_si256((__m256i const *)tmp); - for (U32 j = 1; j < num; j++) { - get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 32, buffer); - __m256i value_v = _mm256_loadu_si256((__m256i const *)tmp); - switch (eltwiseMode) { - case ELTWISE_AND: - tmp_v = _mm256_and_si256(value_v, tmp_v); - break; - case ELTWISE_OR: - tmp_v = _mm256_or_si256(value_v, tmp_v); - break; - case ELTWISE_XOR: - tmp_v = _mm256_xor_si256(value_v, tmp_v); - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP + int in_parallel = omp_in_parallel(); +#pragma omp parallel num_threads(OMP_NUM_THREADS) if (in_parallel == 0) +#endif + { + U8 buffer[32]; + U8 *tmp = buffer; + U8 *output_ptr = (U8 *)output; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = 0; i < len_main; i += 32) { + get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 32, buffer); + __m256i tmp_v = _mm256_loadu_si256((__m256i const *)tmp); + for (U32 j = 1; j < num; j++) { + get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 32, buffer); + __m256i value_v = _mm256_loadu_si256((__m256i const *)tmp); + switch (eltwiseMode) { + case ELTWISE_AND: + tmp_v = _mm256_and_si256(value_v, tmp_v); + break; + case ELTWISE_OR: + tmp_v = _mm256_or_si256(value_v, tmp_v); + break; + case ELTWISE_XOR: + tmp_v = _mm256_xor_si256(value_v, tmp_v); + break; + default: + ret = NOT_SUPPORTED; + break; + } } + _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); } - _mm256_storeu_si256((__m256i *)(output_ptr + i), tmp_v); - } - for (U32 i = len_main; i < len; i++) { - get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 1, buffer); - U8 tmp_s = tmp[0]; - for (U32 j = 1; j < num; j++) { - get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 1, buffer); - U8 value_s = tmp[0]; - switch (eltwiseMode) { - case ELTWISE_AND: - tmp_s = value_s & tmp_s; - break; - case ELTWISE_OR: - tmp_s = value_s | tmp_s; - break; - case 
ELTWISE_XOR: - tmp_s = value_s ^ tmp_s; - break; - default: - ret = NOT_SUPPORTED; +#ifdef _USE_OPENMP +#pragma omp for +#endif + for (U32 i = len_main; i < len; i++) { + get_vector((U8 *)input[0], inputSize[0], &tmp, 32, i, 1, buffer); + U8 tmp_s = tmp[0]; + for (U32 j = 1; j < num; j++) { + get_vector((U8 *)input[j], inputSize[j], &tmp, 32, i, 1, buffer); + U8 value_s = tmp[0]; + switch (eltwiseMode) { + case ELTWISE_AND: + tmp_s = value_s & tmp_s; + break; + case ELTWISE_OR: + tmp_s = value_s | tmp_s; + break; + case ELTWISE_XOR: + tmp_s = value_s ^ tmp_s; + break; + default: + ret = NOT_SUPPORTED; + break; + } } + output_ptr[i] = tmp_s; } - output_ptr[i] = tmp_s; } return ret; } diff --git a/compute/tensor/src/cpu/x86/fp32/gru.cpp b/compute/tensor/src/cpu/x86/fp32/gru.cpp index 93a7b62c..877b94fa 100644 --- a/compute/tensor/src/cpu/x86/fp32/gru.cpp +++ b/compute/tensor/src/cpu/x86/fp32/gru.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "cpu/x86/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE grucell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,8 +64,7 @@ EE grucell_fp32(TensorDesc xDesc, if (!(3 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -85,12 +83,12 @@ EE grucell_fp32(TensorDesc xDesc, F32 *currentBatchH = currentHArray + m * currentHStride; F32 *currentOutput = outputArray + m * batchStrideH; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; - memcpy(currentOutput, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(currentOutput, lastBatchH, hDim * sizeof(F32)); } const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 3; @@ -147,7 +145,7 @@ EE grucell_fp32(TensorDesc xDesc, array_scale_f32(out_z, out_z, column, -1, 1); array_mul_f32(out_z, out_h, out_h, column); array_add_f32(out_r, out_h, currentOutput, column); - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } return SUCCESS; } diff --git a/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp b/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp index b80f19da..b0f62cbe 100644 --- a/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp +++ b/compute/tensor/src/cpu/x86/fp32/instance_norm.cpp @@ -60,11 +60,15 @@ EE instance_norm_fp32(TensorDesc inputDesc, } for 
(I32 i = 0; i < loopOuter; i += 8) { + __m256 m1 = _mm256_setzero_ps(); __m256 m = _mm256_setzero_ps(); for (I32 j = 0; j < loopInner; ++j) { - m = _mm256_add_ps(m, _mm256_loadu_ps(input + i * loopInner + j * 8)); + m1 = _mm256_add_ps(m1, _mm256_loadu_ps(input + i * loopInner + j * 8)); + if (((j + 1) % 1024 == 0) || (j == loopInner - 1)) { + m = _mm256_add_ps(m, _mm256_div_ps(m1, loopInner_v)); + m1 = _mm256_setzero_ps(); + } } - m = _mm256_div_ps(m, loopInner_v); __m256 v = _mm256_setzero_ps(); for (I32 j = 0; j < loopInner; ++j) { __m256 t = _mm256_sub_ps(_mm256_loadu_ps(input + i * loopInner + j * 8), m); diff --git a/compute/tensor/src/cpu/x86/fp32/lstm.cpp b/compute/tensor/src/cpu/x86/fp32/lstm.cpp index 3094c40b..526051e5 100644 --- a/compute/tensor/src/cpu/x86/fp32/lstm.cpp +++ b/compute/tensor/src/cpu/x86/fp32/lstm.cpp @@ -11,7 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "cpu/x86/fp32/mvm_nkn32.h" @@ -54,10 +53,10 @@ EE lstmcell_fp32(TensorDesc xDesc, U32 batch = in; I32 xDim = ix; - I32 hDim = rnnParamSpec.numOutput; - I32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; - int num1 = rnnParamSpec.biDirection ? 2 : 1; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 2 : 1; U32 steps = batchStrideH / hDim / num1; if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { CHECK_STATUS(NOT_MATCH); @@ -65,9 +64,8 @@ EE lstmcell_fp32(TensorDesc xDesc, if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { CHECK_STATUS(NOT_MATCH); } - F32 forgetBias = rnnParamSpec.forgetBias; - ActivationMode activationMode = rnnParamSpec.activationMode; - if (activationMode != ACTIVATION_TANH) { + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { CHECK_STATUS(NOT_SUPPORTED); } @@ -88,8 +86,8 @@ EE lstmcell_fp32(TensorDesc xDesc, for (U32 m = 0; m < batch; m++) { F32 *lastBatchH = lastHArray + m * lastHStride; if (xDim > 0) { - memcpy(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); - memcpy(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); } else { intermediateH = tmpArray; xhArray = lastBatchH; @@ -108,12 +106,12 @@ EE lstmcell_fp32(TensorDesc xDesc, F32 *currentOutput = outputArray + m * batchStrideH; F32 *tmpState, *tmpHH, *tmpH; - if (rnnParamSpec.zoneoutCell == 0) { + if (rnnParamSpec.zoneout_cell == 0) { tmpState = currentBatchState; } else { tmpState = out_i; } - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { tmpHH = out_g; tmpH = currentOutput; } else { @@ -148,27 +146,27 @@ EE lstmcell_fp32(TensorDesc xDesc, tmpState[h] = C_s; tmpHH[h] = value; } - if (rnnParamSpec.zoneoutCell != 0) { - array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneoutCell, 0); - array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneoutCell, 0); + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, 
lastBatchState, column, rnnParamSpec.zoneout_cell, 0); array_add_f32(tmpState, lastBatchState, currentBatchState, column); } - if (rnnParamSpec.numProjection > 0) { - mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.numProjection, (const F32 *)filter[1], + if (rnnParamSpec.num_projection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], tmpHH, tmpH, nullptr); } - if (rnnParamSpec.zoneoutOutput != 0) { - if (rnnParamSpec.numProjection > 0) { - array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } else { - array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); } - array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneoutOutput, 0); + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); array_add_f32(out_f, lastBatchH, currentBatchH, hDim); } else { - memcpy(currentBatchH, currentOutput, sizeof(F32) * hDim); + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); } } return SUCCESS; diff --git a/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h b/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h index cc5db76f..88f31c14 100644 --- a/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h +++ b/compute/tensor/src/cpu/x86/fp32/mvm_nkn32.h @@ -23,6 +23,7 @@ inline void mvm_nkn32_with_bias( #pragma omp parallel for num_threads(OMP_NUM_THREADS) #endif for (U32 n = 0; n < fn; ++n) { + FTZ; const F32 *f = filterArray + n * fk * 32; F32 *out = output + n * 32; const F32 *b = bias + n * 32; diff --git a/compute/tensor/src/cpu/x86/fp32/normalization.cpp b/compute/tensor/src/cpu/x86/fp32/normalization.cpp index 46b08655..7d6276bc 100644 --- a/compute/tensor/src/cpu/x86/fp32/normalization.cpp +++ b/compute/tensor/src/cpu/x86/fp32/normalization.cpp @@ -14,10 +14,11 @@ #include #include "cpu/x86/fp32/tensor_computing_fp32.h" -inline void array_norm_scale_fp32( +static F32 eps = 1e-6; + +inline static void array_norm_scale_fp32( F32 *input, F32 *output, I32 len, F32 mean, F32 var, F32 *alpha, F32 *beta) { - F32 eps = 1e-6; F32 std_value = sqrt(var + eps); __m256 mean_v = _mm256_set1_ps(mean); __m256 std_v = _mm256_set1_ps(std_value); @@ -38,17 +39,17 @@ inline void array_norm_scale_fp32( } } -EE layer_normalization_fp32( +static EE layer_normalization_nhwc( TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); - if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { - CHECK_STATUS(NULL_POINTER); - } - U32 size = tensorNumElements(inputDesc); I32 size_inner = inputDesc.dims[0]; I32 size_outer = size / size_inner; + +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif for (I32 i = 0; i < size_outer; i++) { F32 *current_input = input + i * size_inner; F32 *current_output = output + i * size_inner; @@ -57,6 +58,81 @@ EE layer_normalization_fp32( array_norm_scale_fp32(current_input, current_output, size_inner, mean, var, alpha, beta); } + return SUCCESS; +} + +static EE layer_normalization_nchwc8( + TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output) +{ + UNUSED(outputDesc); + int n = inputDesc.dims[inputDesc.nDims - 1]; + int c = inputDesc.dims[inputDesc.nDims - 2]; + int hw = 1; + for (unsigned int i = 0; i < 
inputDesc.nDims - 2; i++) { + hw *= inputDesc.dims[i]; + } + int c8 = c / 8; + int nums = n * hw; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (int x = 0; x < nums; ++x) { + int i = x / hw; + int j = x % hw; + __m256 sum_v = _mm256_set1_ps(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + sum_v = _mm256_add_ps(sum_v, _mm256_loadu_ps(input + id)); + } + F32 mean = _mm256_sum_ps(sum_v) / c; + __m256 mean_v = _mm256_set1_ps(mean); + + sum_v = _mm256_set1_ps(0); + for (int k = 0; k < c8; k++) { + int id = ((i * c8 + k) * hw + j) * 8; + __m256 tmp_v = _mm256_sub_ps(_mm256_loadu_ps(input + id), mean_v); + sum_v = _mm256_fmadd_ps(tmp_v, tmp_v, sum_v); + } + F32 var = _mm256_sum_ps(sum_v) / c; + F32 std_value = sqrt(var + eps); + + __m256 std_v = _mm256_set1_ps(std_value); + for (int k = 0, kk = 0; k < c8; k++, kk += 8) { + int id = ((i * c8 + k) * hw + j) * 8; + __m256 in = _mm256_loadu_ps(input + id); + __m256 alpha_v = _mm256_loadu_ps(alpha + kk); + __m256 beta_v = _mm256_loadu_ps(beta + kk); + __m256 tmp_v = _mm256_sub_ps(in, mean_v); + tmp_v = _mm256_div_ps(tmp_v, std_v); + tmp_v = _mm256_fmadd_ps(alpha_v, tmp_v, beta_v); + _mm256_storeu_ps(output + id, tmp_v); + } + } return SUCCESS; } + +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output) +{ + if (nullptr == alpha || nullptr == beta || nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + EE ret = NOT_SUPPORTED; + if (inputDesc.df == DF_NCHWC8) { + if (p.axis == 1) { + ret = layer_normalization_nchwc8(inputDesc, input, alpha, beta, outputDesc, output); + } + } else { + if (p.axis == -1) { + ret = layer_normalization_nhwc(inputDesc, input, alpha, beta, outputDesc, output); + } + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/fp32/pooling.cpp b/compute/tensor/src/cpu/x86/fp32/pooling.cpp index ec456ab7..7c92885d 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/pooling_kernel.h" #define UNROLL_W 4 @@ -19,255 +20,8 @@ typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 typedef void (*pooling_mean_func)( const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); -void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - - "vmovups (%0), %%ymm0 \n\t" - "vmovups (%%rax), %%ymm1 \n\t" - "vmovups (%%r9), %%ymm2 \n\t" - "vmovups (%%r10), %%ymm3 \n\t" - - ".align 16 \n\t" - "0: \n\t" - - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmovups (%%r9), %%ymm6 \n\t" - "vmovups (%%r10), %%ymm7 \n\t" - - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" - "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" - "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" - - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "add $0x20, %%r9 \n\t" - "add $0x20, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - "vmovups %%ymm2, 0x40(%1) \n\t" - "vmovups %%ymm3, 0x60(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", - "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); -} - -void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vmovups (%0), %%ymm0 \n\t" - "vmovups (%%rax), %%ymm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); -} - -void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vmovups (%0), %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" - "add $0x20, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%ymm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); -} - -void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 
stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" - "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vmovups (%%r9), %%ymm6 \n\t" - "vmovups (%%r10), %%ymm7 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" - "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" - "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "add $0x20, %%r9 \n\t" - "add $0x20, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" - "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" - "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - "vmovups %%ymm2, 0x40(%1) \n\t" - "vmovups %%ymm3, 0x60(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", - "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); -} - -void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vmovups (%%rax), %%ymm5 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" - "add $0x20, %0 \n\t" - "add $0x20, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" - "vmovups %%ymm0, (%1) \n\t" - "vmovups %%ymm1, 0x20(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); -} - -void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%ymm4 \n\t" - "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" - "add $0x20, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%ymm4 \n\t" - "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" - "vmovups %%ymm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); -} - -EE pooling_fp32(TensorDesc 
inputDesc, - const F32 *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - F32 *output) +EE pooling_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -288,15 +42,15 @@ EE pooling_fp32(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - PoolingMode pm = poolingParamSpec.mode; - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; U32 wSize, kh, kw, iStep; - F32 poolSize, *curO; + F32 *curO; const F32 *curI; if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { CHECK_STATUS(NOT_SUPPORTED); @@ -307,6 +61,7 @@ EE pooling_fp32(TensorDesc inputDesc, U32 wSizes[3] = {1, 2, 4}; pooling_max_func pooling_max[3] = {pooling_max_w1, pooling_max_w2, pooling_max_w4}; pooling_mean_func pooling_mean[3] = {pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}; + F32 poolSize = kernelSizeH * kernelSizeW; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < oh; h++) { @@ -329,7 +84,9 @@ EE pooling_fp32(TensorDesc inputDesc, kh = hend - hstart; kw = wend - wstart; iStep = (iw - kw) * 32; - poolSize = kw * kh * 1.0f; + if (!p.count_include_pad) { + poolSize = kh * kw; + } if (kw < kernelSizeW) { wSize = 1; } @@ -344,7 +101,7 @@ EE pooling_fp32(TensorDesc inputDesc, break; } default: - CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; } } } diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp index 538e6b71..78b79ffe 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling_avx512.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "cpu/x86/fp32/tensor_computing_fp32.h" +#include "cpu/x86/fp32/pooling_kernel.h" #define UNROLL_W 4 @@ -19,258 +20,8 @@ typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 typedef void (*pooling_mean_func)( const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); -void pooling_c16_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - - "vmovups (%0), %%zmm0 \n\t" - "vmovups (%%rax), %%zmm1 \n\t" - "vmovups (%%r9), %%zmm2 \n\t" - "vmovups (%%r10), %%zmm3 \n\t" - - ".align 16 \n\t" - "0: \n\t" - - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmovups (%%r9), %%zmm6 \n\t" - "vmovups (%%r10), %%zmm7 \n\t" - - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" - "vmaxps %%zmm2, %%zmm6, %%zmm2 \n\t" - "vmaxps %%zmm3, %%zmm7, %%zmm3 \n\t" - - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "add $0x40, %%r9 \n\t" - "add $0x40, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - "vmovups %%zmm2, 0x80(%1) \n\t" - "vmovups %%zmm3, 0xC0(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", - "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); -} - -void pooling_c16_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %%eax, %%eax \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vmovups (%0), %%zmm0 \n\t" - "vmovups (%%rax), %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); -} - -void pooling_c16_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) -{ - __asm__ __volatile__("mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vmovups (%0), %%zmm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" - "add $0x40, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vmovups %%zmm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); -} - -void pooling_c16_mean_w4( - const F32 *curI, F32 *curO, U32 kw, U32 
kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "mov %%rax, %%r9 \n\t" - "add %%r9, %%r9 \n\t" - "mov %%rax, %%r10 \n\t" - "add %%r9, %%r10 \n\t" - "add %0, %%rax \n\t" - "add %0, %%r9 \n\t" - "add %0, %%r10 \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" - "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vmovups (%%r9), %%zmm6 \n\t" - "vmovups (%%r10), %%zmm7 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" - "vaddps %%zmm2, %%zmm6, %%zmm2 \n\t" - "vaddps %%zmm3, %%zmm7, %%zmm3 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "add $0x40, %%r9 \n\t" - "add $0x40, %%r10 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "add %%rdi, %%r9 \n\t" - "add %%rdi, %%r10 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" - "vdivps %%zmm4, %%zmm2, %%zmm2 \n\t" - "vdivps %%zmm4, %%zmm3, %%zmm3 \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - "vmovups %%zmm2, 0x80(%1) \n\t" - "vmovups %%zmm3, 0xC0(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", - "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); -} - -void pooling_c16_mean_w2( - const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "mov %5, %%eax \n\t" - "add %0, %%rax \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vmovups (%%rax), %%zmm5 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" - "add $0x40, %0 \n\t" - "add $0x40, %%rax \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "add %%rdi, %%rax \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" - "vmovups %%zmm0, (%1) \n\t" - "vmovups %%zmm1, 0x40(%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); -} - -void pooling_c16_mean_w1( - const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) -{ - __asm__ __volatile__( - "mov %%eax, %%eax \n\t" - "mov %4, %%eax \n\t" - "mov %%rax, %%rdi \n\t" - "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "mov %2, %%ecx \n\t" - ".align 16 \n\t" - "1: \n\t" - "vmovups (%0), %%zmm4 \n\t" - "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" - "add $0x40, %0 \n\t" - "dec %%ecx \n\t" - "jg 1b \n\t" - "add %%rdi, %0 \n\t" - "dec %%ebx \n\t" - "jg 0b \n\t" - "vbroadcastss (%6), %%zmm4 \n\t" - "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" - "vmovups %%zmm0, (%1) \n\t" - : - : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) - : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); 
-} - -EE pooling_c16_fp32(TensorDesc inputDesc, - const F32 *input, - PoolingParamSpec poolingParamSpec, - TensorDesc outputDesc, - F32 *output) +EE pooling_c16_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) { if (nullptr == input || nullptr == output) { CHECK_STATUS(NULL_POINTER); @@ -291,31 +42,33 @@ EE pooling_c16_fp32(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - PoolingMode pm = poolingParamSpec.mode; - U32 strideH = poolingParamSpec.stride_h; - U32 strideW = poolingParamSpec.stride_w; - U32 paddingT = poolingParamSpec.padding_top; - U32 paddingL = poolingParamSpec.padding_left; - U32 kernelSizeH = poolingParamSpec.kernel_h; - U32 kernelSizeW = poolingParamSpec.kernel_w; + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; U32 wSize, kh, kw, iStep; - F32 poolSize, *curO; + F32 *curO; const F32 *curI; if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { CHECK_STATUS(NOT_SUPPORTED); } - if (ic % 16 != 0) { - CHECK_STATUS(NOT_MATCH); - } - ic /= 16; U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; U32 wSizes[3] = {1, 2, 4}; - pooling_max_func pooling_max[3] = {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}; - pooling_mean_func pooling_mean[3] = { - pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}; + pooling_max_func pooling_max[2][3] = {{pooling_max_w1, pooling_max_w2, pooling_max_w4}, + {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}}; + pooling_mean_func pooling_mean[2][3] = {{pooling_mean_w1, pooling_mean_w2, pooling_mean_w4}, + {pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}}; + F32 poolSize = kernelSizeH * kernelSizeW; for (U32 n = 0; n < in; n++) { - for (U32 c = 0; c < ic; c++) { + for (U32 c = 0; c < ic; c += 16) { + U32 cx = 16; + if (c + 16 > ic) { + cx = 8; + } for (U32 h = 0; h < oh; h++) { for (U32 w = 0; w < ow; w += wSize) { if (w < owInter) { @@ -331,23 +84,26 @@ EE pooling_c16_fp32(TensorDesc inputDesc, hstart = UNI_MAX(hstart, 0); wstart = UNI_MAX(wstart, 0); - curI = input + (hstart * iw + wstart) * 16; - curO = output + (h * ow + w) * 16; + curI = input + (hstart * iw + wstart) * cx; + curO = output + (h * ow + w) * cx; kh = hend - hstart; kw = wend - wstart; - iStep = (iw - kw) * 64; - poolSize = kw * kh * 1.0f; + iStep = (iw - kw) * cx * 4; + if (!p.count_include_pad) { + poolSize = kh * kw; + } if (kw < kernelSizeW) { wSize = 1; } switch (pm) { case POOLING_MAX: { - pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 64); + pooling_max[cx >> 4][wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * cx * 4); break; } case POOLING_MEAN: { - pooling_mean[wSize >> 1]( - curI, curO, kw, kh, iStep, strideW * 64, poolSize); + pooling_mean[cx >> 4][wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * cx * 4, poolSize); break; } default: @@ -355,8 +111,8 @@ EE pooling_c16_fp32(TensorDesc inputDesc, } } } - input += ih * iw * 16; - output += oh * ow * 16; + input += ih * iw * cx; + output += oh * ow * cx; } } return SUCCESS; diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp index c866f86e..4ae13a19 100644 --- a/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp +++ b/compute/tensor/src/cpu/x86/fp32/pooling_bp.cpp @@ -15,13 +15,27 @@ #define UNROLL_W 4 -typedef void (*pooling_bp_func)( - const F32 *input, int hstart, int hend, int wstart, 
int wend, F32 *output, U32 ow, U32 strideW); +typedef void (*pooling_bp_func)(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW); -void pooling_bp_c8_w4_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w4_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); __m256 in1 = _mm256_div_ps(_mm256_loadu_ps(input + 8), poolSize); __m256 in2 = _mm256_div_ps(_mm256_loadu_ps(input + 16), poolSize); @@ -44,10 +58,17 @@ void pooling_bp_c8_w4_fp32( } } -void pooling_bp_c8_w2_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w2_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); __m256 in1 = _mm256_div_ps(_mm256_loadu_ps(input + 8), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { @@ -62,10 +83,17 @@ void pooling_bp_c8_w2_fp32( } } -void pooling_bp_c8_w1_fp32( - const F32 *input, int hstart, int hend, int wstart, int wend, F32 *output, U32 ow, U32 strideW) +void pooling_bp_c8_w1_fp32(const F32 *input, + int hstart, + int hend, + int wstart, + int wend, + int pool, + F32 *output, + U32 ow, + U32 strideW) { - __m256 poolSize = _mm256_set1_ps((hend - hstart) * (wend - wstart) * 1.0f); + __m256 poolSize = _mm256_set1_ps(pool); __m256 in0 = _mm256_div_ps(_mm256_loadu_ps(input), poolSize); for (int kernelH = hstart; kernelH < hend; kernelH++) { for (int kernelW = wstart; kernelW < wend; kernelW++) { @@ -98,7 +126,7 @@ EE pooling_bp_fp32( if (idf != DF_NCHWC8 || odf != idf) { ret = NOT_MATCH; } - if (p.padding_top >= p.kernel_h || p.padding_left >= p.kernel_w) { + if (p.pad_top >= p.kernel_h || p.pad_left >= p.kernel_w) { ret = NOT_SUPPORTED; } PoolingMode pm = p.mode; @@ -108,11 +136,12 @@ EE pooling_bp_fp32( ic /= 8; U32 wSize = 0; - U32 iwInter = (ow + p.padding_left - p.kernel_w) / p.stride_w + 1; + U32 iwInter = (ow + p.pad_left - p.kernel_w) / p.stride_w + 1; const F32 *curI = input; F32 *curO = output; pooling_bp_func pooling_bp[3] = { pooling_bp_c8_w1_fp32, pooling_bp_c8_w2_fp32, pooling_bp_c8_w4_fp32}; + int poolSize = p.kernel_t * p.kernel_h * p.kernel_w; for (U32 n = 0; n < in; n++) { for (U32 c = 0; c < ic; c++) { for (U32 h = 0; h < ih; h++) { @@ -122,8 +151,8 @@ EE pooling_bp_fp32( } else { wSize = 1; } - int hstart = (int)h * (int)p.stride_h - (int)p.padding_top; - int wstart = (int)w * (int)p.stride_w - (int)p.padding_left; + int hstart = (int)h * (int)p.stride_h - (int)p.pad_top; + int wstart = (int)w * (int)p.stride_w - (int)p.pad_left; int hend = UNI_MIN(hstart + p.kernel_h, oh); int wend = UNI_MIN(wstart + p.kernel_w, ow); hstart = UNI_MAX(hstart, 0); @@ -131,7 +160,11 @@ EE pooling_bp_fp32( if (wend < wstart + (int)p.kernel_w) { wSize = 1; } - pooling_bp[wSize >> 1](curI, hstart, hend, wstart, wend, curO, ow, p.stride_w); + if (!p.count_include_pad) { + poolSize = (hend - hstart) * (wend - wstart); + } + pooling_bp[wSize >> 1]( + curI, hstart, hend, 
wstart, wend, poolSize, curO, ow, p.stride_w); curI += wSize * 8; } } @@ -139,4 +172,4 @@ EE pooling_bp_fp32( } } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h b/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h new file mode 100644 index 00000000..7f838dca --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/pooling_kernel.h @@ -0,0 +1,508 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _H_POOLING_KERNEL +#define _H_POOLING_KERNEL + +inline void pooling_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + "vmovups (%%r9), %%ymm2 \n\t" + "vmovups (%%r10), %%ymm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vmaxps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vmaxps %%ymm3, %%ymm7, %%ymm3 \n\t" + + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", + "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +inline void pooling_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%ymm0 \n\t" + "vmovups (%%rax), %%ymm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: 
\n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vmaxps %%ymm1, %%ymm5, %%ymm1 \n\t" + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +inline void pooling_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmaxps %%ymm0, %%ymm4, %%ymm0 \n\t" + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +inline void pooling_mean_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorps %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorps %%ymm3, %%ymm3, %%ymm3 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vmovups (%%r9), %%ymm6 \n\t" + "vmovups (%%r10), %%ymm7 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "vaddps %%ymm2, %%ymm6, %%ymm2 \n\t" + "vaddps %%ymm3, %%ymm7, %%ymm3 \n\t" + "add $0x20, %0 \n\t" + "add $0x20, %%rax \n\t" + "add $0x20, %%r9 \n\t" + "add $0x20, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vdivps %%ymm4, %%ymm2, %%ymm2 \n\t" + "vdivps %%ymm4, %%ymm3, %%ymm3 \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + "vmovups %%ymm2, 0x40(%1) \n\t" + "vmovups %%ymm3, 0x60(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%ymm0", "%ymm1", "%ymm2", "%ymm3", + "%ymm4", "%ymm5", "%ymm6", "%ymm7", "memory", "cc"); +} + +inline void pooling_mean_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vmovups (%%rax), %%ymm5 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "vaddps %%ymm1, %%ymm5, %%ymm1 \n\t" + "add $0x20, %0 \n\t" + 
"add $0x20, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vdivps %%ymm4, %%ymm1, %%ymm1 \n\t" + "vmovups %%ymm0, (%1) \n\t" + "vmovups %%ymm1, 0x20(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm1", "%ymm4", "%ymm5", "memory", "cc"); +} + +inline void pooling_mean_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%ymm4 \n\t" + "vaddps %%ymm0, %%ymm4, %%ymm0 \n\t" + "add $0x20, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%ymm4 \n\t" + "vdivps %%ymm4, %%ymm0, %%ymm0 \n\t" + "vmovups %%ymm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%ymm0", "%ymm4", "memory", "cc"); +} + +inline void pooling_c16_max_w4(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%zmm0 \n\t" + "vmovups (%%rax), %%zmm1 \n\t" + "vmovups (%%r9), %%zmm2 \n\t" + "vmovups (%%r10), %%zmm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmovups (%%r9), %%zmm6 \n\t" + "vmovups (%%r10), %%zmm7 \n\t" + + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" + "vmaxps %%zmm2, %%zmm6, %%zmm2 \n\t" + "vmaxps %%zmm3, %%zmm7, %%zmm3 \n\t" + + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + "vmovups %%zmm2, 0x80(%1) \n\t" + "vmovups %%zmm3, 0xC0(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", + "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); +} + +inline void pooling_c16_max_w2(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%zmm0 \n\t" + "vmovups (%%rax), %%zmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vmaxps %%zmm1, %%zmm5, %%zmm1 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 
0b \n\t" + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); +} + +inline void pooling_c16_max_w1(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%zmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmaxps %%zmm0, %%zmm4, %%zmm0 \n\t" + "add $0x40, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%zmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); +} + +inline void pooling_c16_mean_w4( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" + "vxorps %%zmm2, %%zmm2, %%zmm2 \n\t" + "vxorps %%zmm3, %%zmm3, %%zmm3 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vmovups (%%r9), %%zmm6 \n\t" + "vmovups (%%r10), %%zmm7 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" + "vaddps %%zmm2, %%zmm6, %%zmm2 \n\t" + "vaddps %%zmm3, %%zmm7, %%zmm3 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "add $0x40, %%r9 \n\t" + "add $0x40, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" + "vdivps %%zmm4, %%zmm2, %%zmm2 \n\t" + "vdivps %%zmm4, %%zmm3, %%zmm3 \n\t" + "vmovups %%zmm0, (%1) \n\t" + "vmovups %%zmm1, 0x40(%1) \n\t" + "vmovups %%zmm2, 0x80(%1) \n\t" + "vmovups %%zmm3, 0xC0(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", + "%zmm4", "%zmm5", "%zmm6", "%zmm7", "memory", "cc"); +} + +inline void pooling_c16_mean_w2( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + "vxorps %%zmm1, %%zmm1, %%zmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vmovups (%%rax), %%zmm5 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "vaddps %%zmm1, %%zmm5, %%zmm1 \n\t" + "add $0x40, %0 \n\t" + "add $0x40, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vdivps %%zmm4, %%zmm1, %%zmm1 \n\t" + "vmovups %%zmm0, (%1) \n\t" + 
"vmovups %%zmm1, 0x40(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "memory", "cc"); +} + +inline void pooling_c16_mean_w1( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%zmm4 \n\t" + "vaddps %%zmm0, %%zmm4, %%zmm0 \n\t" + "add $0x40, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm4 \n\t" + "vdivps %%zmm4, %%zmm0, %%zmm0 \n\t" + "vmovups %%zmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm4", "memory", "cc"); +} + +#endif \ No newline at end of file diff --git a/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp b/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp new file mode 100644 index 00000000..8776f591 --- /dev/null +++ b/compute/tensor/src/cpu/x86/fp32/pooling_nchw.cpp @@ -0,0 +1,337 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 32 + +typedef void (*pooling_max_func)(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize); + +void pooling_max_w32(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1, x2, x3, x4; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + x2 = _mm256_loadu_ps(curI + 8); + x3 = _mm256_loadu_ps(curI + 16); + x4 = _mm256_loadu_ps(curI + 24); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_max_ps(x2, _mm256_loadu_ps(curI + 8)); + x3 = _mm256_max_ps(x3, _mm256_loadu_ps(curI + 16)); + x4 = _mm256_max_ps(x4, _mm256_loadu_ps(curI + 24)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + x2 = _mm256_i32gather_ps(curI + 8 * stride, v256index, 4); + x3 = _mm256_i32gather_ps(curI + 16 * stride, v256index, 4); + x4 = _mm256_i32gather_ps(curI + 24 * stride, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_max_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + x3 = _mm256_max_ps(x3, _mm256_i32gather_ps(curI + 16 * stride, v256index, 4)); + x4 = _mm256_max_ps(x4, _mm256_i32gather_ps(curI + 24 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); + _mm256_storeu_ps(curO + 8, x2); + _mm256_storeu_ps(curO + 16, x3); + _mm256_storeu_ps(curO + 24, x4); +} + +void pooling_max_w16(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1, x2; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + x2 = _mm256_loadu_ps(curI + 8); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_max_ps(x2, _mm256_loadu_ps(curI + 8)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + x2 = _mm256_i32gather_ps(curI + 8 * stride, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_max_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); + _mm256_storeu_ps(curO + 8, x2); +} + +void pooling_max_w8(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __m256 x1; + if (stride == 1) { + x1 = _mm256_loadu_ps(curI); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_loadu_ps(curI)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + x1 = _mm256_i32gather_ps(curI, v256index, 4); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_max_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, x1); +} + +void pooling_max_w0(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 
iStep, U32 stride) +{ + *curO = *curI; + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + *curO = UNI_MAX(*curO, *curI); + curI += 1; + } + curI += iStep; + } +} + +void pooling_mean_w32( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + __m256 x2 = _mm256_setzero_ps(); + __m256 x3 = _mm256_setzero_ps(); + __m256 x4 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_add_ps(x2, _mm256_loadu_ps(curI + 8)); + x3 = _mm256_add_ps(x3, _mm256_loadu_ps(curI + 16)); + x4 = _mm256_add_ps(x4, _mm256_loadu_ps(curI + 24)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_add_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + x3 = _mm256_add_ps(x3, _mm256_i32gather_ps(curI + 16 * stride, v256index, 4)); + x4 = _mm256_add_ps(x4, _mm256_i32gather_ps(curI + 24 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); + _mm256_storeu_ps(curO + 8, _mm256_mul_ps(x2, x0)); + _mm256_storeu_ps(curO + 16, _mm256_mul_ps(x3, x0)); + _mm256_storeu_ps(curO + 24, _mm256_mul_ps(x4, x0)); +} + +void pooling_mean_w16( + const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + __m256 x2 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + x2 = _mm256_add_ps(x2, _mm256_loadu_ps(curI + 8)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + x2 = _mm256_add_ps(x2, _mm256_i32gather_ps(curI + 8 * stride, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); + _mm256_storeu_ps(curO + 8, _mm256_mul_ps(x2, x0)); +} + +void pooling_mean_w8(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + __m256 x0 = _mm256_set1_ps(1.0f / poolSize); + __m256 x1 = _mm256_setzero_ps(); + + if (stride == 1) { + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_loadu_ps(curI)); + curI += 1; + } + curI += iStep; + } + } else { + __m256i v256index = _mm256_set_epi32( + stride * 7, stride * 6, stride * 5, stride * 4, stride * 3, stride * 2, stride, 0); + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + x1 = _mm256_add_ps(x1, _mm256_i32gather_ps(curI, v256index, 4)); + curI += 1; + } + curI += iStep; + } + } + _mm256_storeu_ps(curO, _mm256_mul_ps(x1, x0)); +} + +void pooling_mean_w0(const F32 *curI, F32 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, F32 poolSize) +{ + *curO = 0; + for (U32 h = 0; h < kh; ++h) { + for (U32 w = 0; w < kw; ++w) { + *curO += *curI; + curI += 1; + } + curI += iStep; + } + *curO /= poolSize; +} + +EE 
pooling_nchw_fp32( + TensorDesc inputDesc, const F32 *input, PoolingParamSpec p, TensorDesc outputDesc, F32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_F32) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHW || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; + U32 wSize, kh, kw, iStep; + F32 *curO; + const F32 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; + U32 wSizes[5] = {1, 8, 16, 16, 32}; + pooling_max_func pooling_max[5] = { + pooling_max_w0, pooling_max_w8, pooling_max_w16, pooling_max_w16, pooling_max_w32}; + pooling_mean_func pooling_mean[5] = { + pooling_mean_w0, pooling_mean_w8, pooling_mean_w16, pooling_mean_w16, pooling_mean_w32}; + F32 poolSize = kernelSizeH * kernelSizeW; + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + for (U32 h = 0; h < oh; h++) { + int hstart = (int)h * (int)strideH - (int)paddingT; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + hstart = UNI_MAX(hstart, 0); + kh = hend - hstart; + for (U32 w = 0; w < ow; w += wSize) { + if (w < owInter) { + wSize = UNI_MIN(owInter - w, UNROLL_W); + } else { + wSize = 1; + } + wSize = wSizes[wSize >> 3]; + int wstart = (int)w * (int)strideW - (int)paddingL; + int wend = UNI_MIN(wstart + kernelSizeW, iw); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart); + curO = output + (h * ow + w); + kw = wend - wstart; + iStep = iw - kw; + if (!p.count_include_pad) { + poolSize = kh * kw; + } + if (kw < kernelSizeW) { + wSize = 1; + } + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 3](curI, curO, kw, kh, iStep, strideW); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 3](curI, curO, kw, kh, iStep, strideW, poolSize); + break; + } + default: + CHECK_STATUS(NOT_SUPPORTED); + } + } + } + input += ih * iw; + output += oh * ow; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/fp32/scale.cpp b/compute/tensor/src/cpu/x86/fp32/scale.cpp index f2887fb9..c1d5b830 100644 --- a/compute/tensor/src/cpu/x86/fp32/scale.cpp +++ b/compute/tensor/src/cpu/x86/fp32/scale.cpp @@ -59,29 +59,45 @@ EE scale_nchwc8_fp32( return SUCCESS; } +template EE scale_nchw_fp32( F32 *input, F32 *alpha, F32 *beta, I32 in, I32 ic, I32 elements_per_channel, F32 *output) { __m256 one = _mm256_set1_ps(1.); __m256 zero = _mm256_set1_ps(0.); - U32 index = 0; - for (I32 n = 0; n < in; n++) { - for (I32 c = 0; c < ic; c++) { - __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]); - __m256 beta_vec = (beta == nullptr) ? 
zero : _mm256_set1_ps(beta[c]); - I32 i = 0; - for (; i < elements_per_channel - 7; i += 8) { - __m256 in_vec = _mm256_loadu_ps(input + index); - __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); - _mm256_storeu_ps(output + index, out_vec); - index += 8; +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) +#endif + for (int j = 0; j < in * ic; j++) { + int n = j / ic; + int c = j % ic; + //for (I32 n = 0; n < in; n++) { + // for (I32 c = 0; c < ic; c++) { + U32 dst = j * elements_per_channel, src = 0; + __m256 alpha_vec = (alpha == nullptr) ? one : _mm256_set1_ps(alpha[c]); + __m256 beta_vec = (beta == nullptr) ? zero : _mm256_set1_ps(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; } - for (; i < elements_per_channel; i++) { - float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; - float beta_s = (beta == nullptr) ? 0 : beta[c]; - output[index] = alpha_s * input[index] + beta_s; - index++; + __m256 in_vec = _mm256_loadu_ps(input + src); + __m256 out_vec = _mm256_fmadd_ps(alpha_vec, in_vec, beta_vec); + _mm256_storeu_ps(output + dst, out_vec); + dst += 8; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; } + float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + float beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; } } return SUCCESS; @@ -114,6 +130,7 @@ EE scale_nhwc_fp32( for (; c < ic; c++) { float alpha_s = (alpha == nullptr) ? 1 : alpha[c]; float beta_s = (beta == nullptr) ? 0 : beta[c]; + float in_s; if (icoc_equal) { in_s = input[dst]; } else { @@ -143,14 +160,18 @@ EE scale_fp32(F32 *input, } EE ret = SUCCESS; // If oc is 1, it means that weights/vectors have only one param, so we need use the calculation logic of nchw. 
- if (axis == 1 || axis == 0 || oc == 1) { - ret = scale_nchw_fp32(input, alpha, beta, on, oc, elements_per_channel, output); - } else if (axis == nDims - 1) { + if (axis == nDims - 1) { if (ic == oc) { ret = scale_nhwc_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); } else { ret = scale_nhwc_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); } + } else if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_fp32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_fp32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } } else if (axis == nDims) { ret = scale_nchwc8_fp32(input, alpha, beta, on, oc, elements_per_channel, output); #ifdef _USE_INT8 diff --git a/compute/tensor/src/cpu/x86/fp32/softmax.cpp b/compute/tensor/src/cpu/x86/fp32/softmax.cpp index 3edd624a..71fffc87 100644 --- a/compute/tensor/src/cpu/x86/fp32/softmax.cpp +++ b/compute/tensor/src/cpu/x86/fp32/softmax.cpp @@ -14,59 +14,76 @@ #include "cpu/x86/fp32/tensor_computing_fp32.h" #include "tensor_transpose.h" -void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) +template <bool logsoftmax> +static void softmax_lastAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, F32 *output) { for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtr = input + i * loops; F32 *outputPtr = output + i * loops; - __m256 max_v, sub_v, sum_v, tmp_v; + __m256 max_v, tmp_v; F32 max_s, tmp_s; - array_minmax_value_f32(inputPtr, loops, 2, &max_s); - max_v = _mm256_set1_ps(max_s); - sum_v = _mm256_set1_ps(0.f); - + if (!logsoftmax) { + array_minmax_value_f32(inputPtr, loops, 2, &max_s); + max_v = _mm256_set1_ps(max_s); + } I32 j = 0; - F32 sum_s = 0; - for (j = 0; j < loops - 7; j += 8) { + __m256 sum_v = _mm256_set1_ps(0.f); + for (; j < loops - 7; j += 8) { __m256 in = _mm256_loadu_ps(inputPtr + j); - sub_v = _mm256_sub_ps(in, max_v); - tmp_v = _mm256_exp_ps(sub_v); + if (!logsoftmax) { + in = _mm256_sub_ps(in, max_v); + } + tmp_v = _mm256_exp_ps(in); sum_v = _mm256_add_ps(sum_v, tmp_v); - _mm256_storeu_ps(outputPtr + j, tmp_v); + if (!logsoftmax) { + _mm256_storeu_ps(outputPtr + j, tmp_v); + } } - sum_s += _mm256_sum_ps(sum_v); + F32 sum_s = _mm256_sum_ps(sum_v); for (; j < loops; j++) { - tmp_s = exp(inputPtr[j] - max_s); - outputPtr[j] = tmp_s; + if (logsoftmax) { + tmp_s = exp(inputPtr[j]); + } else { + tmp_s = exp(inputPtr[j] - max_s); + outputPtr[j] = tmp_s; + } sum_s += tmp_s; } - array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + if (logsoftmax) { + array_scale_f32(inputPtr, outputPtr, loops, 1.0, -log(sum_s)); + } else { + array_scale_f32(outputPtr, outputPtr, loops, 1.0 / sum_s, 0); + } } } +template <bool logsoftmax> void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopInner, F32 *output) { std::vector<F32> buffer(loopInner * 2); F32 *maxBuffer = &buffer[0]; F32 *sumBuffer = &buffer[loopInner]; I32 k = 0; + F32 tmp_s; for (I32 i = 0; i < loopOuter; i++) { const F32 *inputPtrBase = input + i * loops * loopInner; F32 *outputPtrBase = output + i * loops * loopInner; - memcpy(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); - memset(sumBuffer, 0, loopInner * sizeof(F32)); - for (I32 j = 1; j < loops; j++) { - const F32 *inputPtr = inputPtrBase + j * loopInner; - for (k = 0; k < loopInner - 7; k += 8) { - __m256 in_v = _mm256_loadu_ps(inputPtr + k); - __m256 out_v = _mm256_loadu_ps(maxBuffer + k); - __m256 max_v = _mm256_max_ps(in_v, out_v); - _mm256_storeu_ps(maxBuffer + k, max_v); - } - for (; k < loopInner; k++) 
{ - maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + UNI_MEMSET(sumBuffer, 0, loopInner * sizeof(F32)); + if (!logsoftmax) { + UNI_MEMCPY(maxBuffer, inputPtrBase, loopInner * sizeof(F32)); + for (I32 j = 1; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 in_v = _mm256_loadu_ps(inputPtr + k); + __m256 out_v = _mm256_loadu_ps(maxBuffer + k); + __m256 max_v = _mm256_max_ps(in_v, out_v); + _mm256_storeu_ps(maxBuffer + k, max_v); + } + for (; k < loopInner; k++) { + maxBuffer[k] = UNI_MAX(maxBuffer[k], inputPtr[k]); + } } } for (I32 j = 0; j < loops; j++) { @@ -74,35 +91,69 @@ void softmax_anyAxis_fp32(const F32 *input, I32 loopOuter, I32 loops, I32 loopIn F32 *outputPtr = outputPtrBase + j * loopInner; for (k = 0; k < loopInner - 7; k += 8) { __m256 in_v = _mm256_loadu_ps(inputPtr + k); - __m256 max_v = _mm256_loadu_ps(maxBuffer + k); - __m256 sub_v = _mm256_sub_ps(in_v, max_v); - __m256 exp_v = _mm256_exp_ps(sub_v); + if (!logsoftmax) { + in_v = _mm256_sub_ps(in_v, _mm256_loadu_ps(maxBuffer + k)); + } + __m256 exp_v = _mm256_exp_ps(in_v); __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); sum_v = _mm256_add_ps(sum_v, exp_v); _mm256_storeu_ps(sumBuffer + k, sum_v); - _mm256_storeu_ps(outputPtr + k, exp_v); + if (!logsoftmax) { + _mm256_storeu_ps(outputPtr + k, exp_v); + } } for (; k < loopInner; k++) { - outputPtr[k] = exp(inputPtr[k] - maxBuffer[k]); - sumBuffer[k] += outputPtr[k]; + if (logsoftmax) { + tmp_s = exp(inputPtr[k]); + } else { + tmp_s = exp(inputPtr[k] - maxBuffer[k]); + outputPtr[k] = tmp_s; + } + sumBuffer[k] += tmp_s; } } - for (I32 j = 0; j < loops; j++) { - F32 *outputPtr = outputPtrBase + j * loopInner; + if (logsoftmax) { for (k = 0; k < loopInner - 7; k += 8) { - __m256 out_v = _mm256_loadu_ps(outputPtr + k); __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); - out_v = _mm256_div_ps(out_v, sum_v); - _mm256_storeu_ps(outputPtr + k, out_v); + sum_v = _mm256_log_ps(sum_v); + _mm256_storeu_ps(sumBuffer + k, sum_v); } for (; k < loopInner; k++) { - outputPtr[k] /= sumBuffer[k]; + sumBuffer[k] = log(sumBuffer[k]); + } + for (I32 j = 0; j < loops; j++) { + const F32 *inputPtr = inputPtrBase + j * loopInner; + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 out_v = _mm256_loadu_ps(inputPtr + k); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + out_v = _mm256_sub_ps(out_v, sum_v); + _mm256_storeu_ps(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] -= sumBuffer[k]; + } + } + } else { + for (I32 j = 0; j < loops; j++) { + F32 *outputPtr = outputPtrBase + j * loopInner; + for (k = 0; k < loopInner - 7; k += 8) { + __m256 out_v = _mm256_loadu_ps(outputPtr + k); + __m256 sum_v = _mm256_loadu_ps(sumBuffer + k); + out_v = _mm256_div_ps(out_v, sum_v); + _mm256_storeu_ps(outputPtr + k, out_v); + } + for (; k < loopInner; k++) { + outputPtr[k] /= sumBuffer[k]; + } } } } } -EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +template +static EE softmax_kernel( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) { UNUSED(outputDesc); if (nullptr == input || nullptr == output) { @@ -146,9 +197,20 @@ EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc out } U32 loop_outer = size / loops / loop_inner; if (axis == 0) { - softmax_lastAxis_fp32(input, loop_outer, loops, output); + softmax_lastAxis_fp32(input, loop_outer, loops, output); } 
else { - softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); + softmax_anyAxis_fp32(input, loop_outer, loops, loop_inner, output); } return SUCCESS; } + +EE softmax_fp32(TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} + +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output) +{ + return softmax_kernel(inputDesc, input, axis, outputDesc, output); +} diff --git a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h index 52018766..4101368f 100644 --- a/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/tensor_computing_fp32.h @@ -78,6 +78,20 @@ EE convolution_direct(TensorDesc inputDesc, F32 *outArray, ActivationParamSpec activationDesc); +EE convolution_winograd(TensorDesc inputDesc, + F32 *inArray, + F32 *eltwiseInput, + TensorDesc filterDesc, + const F32 *filterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc biasDesc, + const F32 *biasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + F32 *outArray, + ActivationParamSpec activationDesc); + EE convolution_1x1_direct(TensorDesc inputDesc, F32 *inArray, F32 *eltwiseInput, @@ -110,7 +124,7 @@ EE check_fp32(TensorDesc inputDescA, const F32 *inputB, CheckMode checkMode, TensorDesc outputDesc, - I32 *output); + U8 *output); EE clip_fp32(F32 *input, F32 *output, I32 len, F32 minValue, F32 maxValue); @@ -223,8 +237,13 @@ EE eltwise_u8(std::vector input, void *output, EltwiseMode eltwiseMode); -EE layer_normalization_fp32( - TensorDesc inputDesc, F32 *input, F32 *alpha, F32 *beta, TensorDesc outputDesc, F32 *output); +EE layer_normalization_fp32(TensorDesc inputDesc, + F32 *input, + LayerNormParamSpec p, + F32 *alpha, + F32 *beta, + TensorDesc outputDesc, + F32 *output); EE l2normalization_fp32(TensorDesc inputDesc, const F32 *input, TensorDesc outputDesc, F32 *output); @@ -276,6 +295,12 @@ EE grucell_fp32(TensorDesc xDesc, void *output, Arch arch); +EE pooling_nchw_fp32(TensorDesc inputDesc, + const F32 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + F32 *output); + EE pooling_fp32(TensorDesc inputDesc, const F32 *input, PoolingParamSpec poolingParamSpec, @@ -305,6 +330,9 @@ EE scale_fp32(F32 *input, EE softmax_fp32( TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); +EE logsoftmax_fp32( + TensorDesc inputDesc, const F32 *input, int axis, TensorDesc outputDesc, F32 *output); + EE deconvolution_transform_filter_fp32(TensorDesc filterDesc, const F32 *filter, ConvolutionForwardAlgorithm algorithm, diff --git a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h index 213131d6..1a2db428 100644 --- a/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/transform_functions_fp32.h @@ -45,7 +45,7 @@ inline void transformNCHWCxNx(U32 fc, U32 fh, U32 fw, U32 oc, const F32 *input, _mm256_storeu_ps(dest + 24, _mm256_i32gather_ps(src + 24 * lstep, vindex, 4)); } } - memset(dest + N, 0, ((cSizePadding - cSize) * N * 4)); + UNI_MEMSET(dest + N, 0, ((cSizePadding - cSize) * N * 4)); } } } @@ -112,7 +112,7 @@ inline EE transformNCHWToNCHWCxNx( dest = output + c * fh * fw * 8 + hw * cSizePadding * 8 + c8 * 8; _mm256_storeu_ps(dest, _mm256_mask_i32gather_ps(src256, src, vindex, mask, 4)); } - 
memset(dest + 8, 0, ((cSizePadding - cSize) * 32)); + UNI_MEMSET(dest + 8, 0, ((cSizePadding - cSize) * 32)); } } fn += remain; @@ -128,10 +128,10 @@ inline void PaddingNCHWC8( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -148,8 +148,9 @@ inline void PaddingNCHWC8( #endif for (U32 c = 0; c < ic; ++c) { U32 coff = c * padih * padiw * 8; - memset(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt)); - memset(tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt)); + UNI_MEMSET(tmp + coff, 0, padiw * paddingT * 8 * bytesOf(idt)); + UNI_MEMSET( + tmp + coff + (ih + paddingT) * padiw * 8, 0, padiw * paddingB * 8 * bytesOf(idt)); } #ifdef _USE_OPENMP @@ -161,10 +162,10 @@ inline void PaddingNCHWC8( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt)); - memcpy(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw * 8 + h * iw * 8, + UNI_MEMSET(tmp + coff + hoff * 8, 0, paddingL * 8 * bytesOf(idt)); + UNI_MEMCPY(tmp + coff + (hoff + paddingL) * 8, data + c * ih * iw * 8 + h * iw * 8, iw * 8 * bytesOf(idt)); - memset(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt)); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * 8, 0, paddingR * 8 * bytesOf(idt)); } #ifdef _USE_OPENMP @@ -188,8 +189,8 @@ inline void deconvOverlapAndCrop(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; __m256i vindex = _mm256_set_epi32(fhfw * 7, fhfw * 6, fhfw * 5, fhfw * 4, fhfw * 3, fhfw * 2, fhfw, 0); for (U32 kn = 0; kn < in; ++kn) { @@ -216,7 +217,7 @@ inline void deconvOverlapAndCrop(F32 *input, } } } - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } @@ -237,8 +238,8 @@ inline void deconvOverlapAndCropNCHWC8(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { for (U32 kh = 0; kh < ih; ++kh) { for (U32 kw = 0; kw < iw; ++kw) { @@ -263,7 +264,7 @@ inline void deconvOverlapAndCropNCHWC8(F32 *input, } } } - input += ic * ih * iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } @@ -285,8 +286,8 @@ inline void deconvOverlapAndCropEqualNCHWC8(F32 *input, U32 fhfw = fh * fw; U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingL = convParamSpec.padding_left; + U32 paddingT = convParamSpec.pad_top; + U32 paddingL = convParamSpec.pad_left; for (U32 kn = 0; kn < in; ++kn) { for (U32 kc = 0; kc < oc; kc += 8) { #ifdef _USE_OPENMP @@ -312,7 +313,7 @@ inline void deconvOverlapAndCropEqualNCHWC8(F32 *input, } } } - input += ic * ih * 
iw; + input += oc * fh * fw * ih * iw; output += oc * oh * ow; } } diff --git a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h index 64aaeca9..963218ea 100644 --- a/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h +++ b/compute/tensor/src/cpu/x86/fp32/x86_functions_fp32.h @@ -13,147 +13,101 @@ #ifndef CHEETAH_X86_FUNCTIONS_FP32_H #define CHEETAH_X86_FUNCTIONS_FP32_H -#include + +#include "cpu/cpu_functions_template.h" #include "x86_avx2_expand.h" -#include "parameter_spec.h" -#include "uni.h" #include "thread_affinity.h" inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDesc, F32 *output) { - __m256 in, out; __m256 zero = _mm256_set1_ps(0.); __m256 one = _mm256_set1_ps(1.); __m256 three = _mm256_set1_ps(3.); __m256 six = _mm256_set1_ps(6.); __m256 signm = _mm256_set1_ps(-0.0); - U32 len_main = len / 8; - U32 len_tail = len % 8; - - F32 value; + U32 loops = len / 8 * 8; EE ret = SUCCESS; - switch (activationDesc.mode) { case ACTIVATION_NULL: { + if (output != input) { + UNI_MEMCPY(output, input, sizeof(float) * len); + } + loops = len; break; } case ACTIVATION_RELU: { - U32 main_len = len - len_tail; if (activationDesc.value[0] == 0) { #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) #endif - for (U32 i = 0; i < len_main; i++) { - _mm256_storeu_ps( - output + i * 8, _mm256_max_ps(zero, _mm256_loadu_ps(input + i * 8))); - } - for (U32 i = 0; i < len_tail; i++) { - output[main_len + i] = (input[main_len + i] < 0) ? 0 : input[main_len + i]; + for (U32 i = 0; i < loops; i += 8) { + _mm256_storeu_ps(output + i, _mm256_max_ps(zero, _mm256_loadu_ps(input + i))); } } else { __m256 scale = _mm256_set1_ps(activationDesc.value[0]); #ifdef _USE_OPENMP #pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) #endif - for (U32 i = 0; i < len_main; i++) { - __m256 tmp = _mm256_loadu_ps(input + i * 8); - _mm256_storeu_ps(output + i * 8, _mm256_max_ps(_mm256_mul_ps(scale, tmp), tmp)); - } - for (U32 i = 0; i < len_tail; i++) { - float tmp = activationDesc.value[0] * input[main_len + i]; - output[main_len + i] = (input[main_len + i] < tmp) ? tmp : input[main_len + i]; + for (U32 i = 0; i < loops; i += 8) { + __m256 tmp = _mm256_loadu_ps(input + i); + _mm256_storeu_ps(output + i, _mm256_max_ps(_mm256_mul_ps(scale, tmp), tmp)); } } break; } case ACTIVATION_RELU6: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_max_ps(zero, in); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_max_ps(zero, in); out = _mm256_min_ps(six, out); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = (input[i] < 0) ? 0 : input[i]; - if (value > 6) { - value = 6; - } - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_div_ps(out, six); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 
6 : value; - value = value / 6; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SWISH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_div_ps(out, six); out = _mm256_mul_ps(out, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - value = value / 6; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_H_SWISH_NODIV: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_add_ps(in, three); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_add_ps(in, three); out = _mm256_max_ps(out, zero); out = _mm256_min_ps(out, six); out = _mm256_mul_ps(out, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] + 3; - value = (value < 0) ? 0 : value; - value = (value > 6) ? 6 : value; - value = input[i] * value; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_GELU: { - F32 two_div_PI_sqrt = sqrt(2 / 3.14159265358979323846); - __m256 vec0 = _mm256_set1_ps(two_div_PI_sqrt); + __m256 vec0 = _mm256_set1_ps(sqrt(2 / 3.14159265358979323846)); __m256 vec1 = _mm256_set1_ps(0.044715); __m256 vec2 = _mm256_set1_ps(0.5); - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_mul_ps(in, in); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps(in, in); out = _mm256_mul_ps(out, in); out = _mm256_fmadd_ps(vec1, out, in); out = _mm256_mul_ps(vec0, out); @@ -161,136 +115,126 @@ inline EE activation_fp32(F32 *input, U32 len, ActivationParamSpec activationDes out = _mm256_add_ps(one, out); out = _mm256_mul_ps(vec2, out); out = _mm256_mul_ps(in, out); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i]; - value = two_div_PI_sqrt * (value + 0.044715 * powf(value, 3)); - value = 1.0 - 2.0 / (exp(2.0 * value) + 1.0); - value = 0.5 * (1.0 + value); - value = input[i] * value; - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_TANH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_tanh_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = 1.0 - 2.0 / (exp(2.0 * input[i]) + 1.0); - output[i] = value; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_tanh_ps(in); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_SIGMOID: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_sigmod_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_sigmod_ps(in); + _mm256_storeu_ps(output + i, out); } - for (U32 i = 0; i < len_tail; i++) { - value 
= 1.0 / (1.0 + exp(-1.0 * input[i])); - output[i] = value; + break; + } + case ACTIVATION_SWISH: { +#ifdef _USE_OPENMP +#pragma omp parallel for num_threads(OMP_NUM_THREADS) schedule(static) +#endif + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps(in, _mm256_sigmod_ps(in)); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_MISH: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_mul_ps( + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_mul_ps( in, _mm256_tanh_ps(_mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)))); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - value = input[i] * tanh(log(exp(input[i]) + 1.0)); - output[i] = value; + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_SOFTPLUS: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = log(1 + exp(input[i])); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_log_ps(_mm256_add_ps(_mm256_exp_ps(in), one)); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_EXP: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_exp_ps(in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; - } - for (U32 i = 0; i < len_tail; i++) { - output[i] = exp(input[i]); + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_exp_ps(in); + _mm256_storeu_ps(output + i, out); } break; } case ACTIVATION_ABS: { - for (U32 i = 0; i < len_main; i++) { - in = _mm256_loadu_ps(input); - out = _mm256_andnot_ps(signm, in); - _mm256_storeu_ps(output, out); - input += 8; - output += 8; + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_andnot_ps(signm, in); + _mm256_storeu_ps(output + i, out); } - for (U32 i = 0; i < len_tail; i++) { - output[i] = UNI_ABS(input[i]); + break; + } + case ACTIVATION_LOG: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_log_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_SIGN: { - for (U32 i = 0; i < len; i++) { - output[i] = UNI_SIGN(input[i]); + case ACTIVATION_ROUND: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_round_ps(in, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_LOG: { - for (U32 i = 0; i < len; i++) { - output[i] = log(input[i]); + case ACTIVATION_CEIL: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_ceil_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_NOT: { - for (U32 i = 0; i < len; i++) { - output[i] = (input[i] > 0) ? 0 : 1; + case ACTIVATION_FLOOR: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_floor_ps(in); + _mm256_storeu_ps(output + i, out); } break; } - case ACTIVATION_GREATER: { - for (U32 i = 0; i < len; i++) { - output[i] = input[i] > 1 ? 
1 : 0; + case ACTIVATION_RECIPROCAL: { + for (U32 i = 0; i < loops; i += 8) { + __m256 in = _mm256_loadu_ps(input + i); + __m256 out = _mm256_div_ps(one, in); + _mm256_storeu_ps(output + i, out); } break; } + case ACTIVATION_SIGN: + case ACTIVATION_NOT: + case ACTIVATION_GREATER: case ACTIVATION_NEG: { - for (U32 i = 0; i < len; i++) { - output[i] = -input[i]; - } + loops = 0; break; } default: ret = NOT_SUPPORTED; break; } + if (ret == SUCCESS) { + for (U32 i = loops; i < len; i++) { + ret = activation_template(activationDesc, input[i], output + i); + } + } return ret; } @@ -334,7 +278,7 @@ inline void array_power_f32(F32 *input, F32 *output, I32 len, F32 power) } } else if (power == 1) { if (input != output) { - memcpy(output, input, len * sizeof(F32)); + UNI_MEMCPY(output, input, len * sizeof(F32)); } i = len; } else if (power == 2) { @@ -478,8 +422,7 @@ inline F32 array_var_f32(const F32 *data, I32 len, F32 mean) sum_s += _mm256_sum_ps(sum_v); } for (; i < len; i++) { - F32 in = data[i]; - F32 tmp = in - mean; + F32 tmp = data[i] - mean; sum_s += tmp * tmp; } return sum_s / len; @@ -506,6 +449,26 @@ inline F32 array_sum_f32(const F32 *data, I32 len) return sum_s; } +inline I32 array_sum_i32(const I32 *data, I32 len) +{ + if (len <= 0) { + return 0; + } + + I32 i = 0; + I32 sum_s = 0; + __m256i sum_v = _mm256_set1_epi32(0); + for (i = 0; i < len - 7; i += 8) { + __m256i in = _mm256_loadu_si256((const __m256i *)(data + i)); + sum_v = _mm256_add_epi32(sum_v, in); + } + sum_s += _mm256_sum_epi32(sum_v); + for (; i < len; i++) { + sum_s += data[i]; + } + return sum_s; +} + // array mean inline F32 array_mean_f32(const F32 *data, I32 len) { diff --git a/compute/tensor/src/cpu/x86/int32/scale.cpp b/compute/tensor/src/cpu/x86/int32/scale.cpp new file mode 100644 index 00000000..0d0bafd5 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int32/scale.cpp @@ -0,0 +1,151 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int32/tensor_computing_int32.h" + +static EE scale_nchwc8_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i in_vec, out_vec; + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + U32 index = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c += 8) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_si256((const __m256i *)(alpha + c)); + __m256i beta_vec = (beta == nullptr) ? 
zero : _mm256_loadu_si256((const __m256i *)(beta + c)); + for (I32 i = 0; i < elements_per_channel; i++) { + in_vec = _mm256_loadu_si256((const __m256i *)(input + index)); + out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + index), out_vec); + index += 8; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +static EE scale_nchw_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 c = 0; c < ic; c++) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_set1_epi32(alpha[c]); + __m256i beta_vec = (beta == nullptr) ? zero : _mm256_set1_epi32(beta[c]); + I32 i = 0; + for (; i < elements_per_channel - 7; i += 8) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + __m256i in_vec = _mm256_loadu_si256((const __m256i *)(input + src)); + __m256i out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + dst), out_vec); + dst += 8; + } + for (; i < elements_per_channel; i++) { + if (icoc_equal) { + src = (n * ic + c) * elements_per_channel + i; + } else { + src = n * elements_per_channel + i; + } + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + output[dst] = alpha_s * input[src] + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +template <bool icoc_equal> +static EE scale_nhwc_int32( + I32 *input, I32 *alpha, I32 *beta, I32 in, I32 ic, I32 elements_per_channel, I32 *output) +{ + __m256i one = _mm256_set1_epi32(1); + __m256i zero = _mm256_set1_epi32(0); + __m256i in_vec; + int in_s; + U32 dst = 0, src = 0; + for (I32 n = 0; n < in; n++) { + for (I32 i = 0; i < elements_per_channel; i++, src++) { + I32 c = 0; + for (; c < ic - 7; c += 8) { + __m256i alpha_vec = (alpha == nullptr) ? one : _mm256_loadu_si256((const __m256i *)(alpha + c)); + __m256i beta_vec = (beta == nullptr) ? zero : _mm256_loadu_si256((const __m256i *)(beta + c)); + if (icoc_equal) { + in_vec = _mm256_loadu_si256((const __m256i *)(input + dst)); + } else { + in_vec = _mm256_set1_epi32(input[src]); + } + __m256i out_vec = _mm256_add_epi32(_mm256_mul_epi32(alpha_vec, in_vec), beta_vec); + _mm256_storeu_si256((__m256i *)(output + dst), out_vec); + dst += 8; + } + for (; c < ic; c++) { + int alpha_s = (alpha == nullptr) ? 1 : alpha[c]; + int beta_s = (beta == nullptr) ? 0 : beta[c]; + if (icoc_equal) { + in_s = input[dst]; + } else { + in_s = input[src]; + } + output[dst] = alpha_s * in_s + beta_s; + dst++; + } + } + } + return SUCCESS; +} + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + EE ret = SUCCESS; + // If oc is 1, it means that weights/vectors have only one param, so we need to use the calculation logic of nchw. 
+ if (axis == 1 || axis == 0 || oc == 1) { + if (ic == oc) { + ret = scale_nchw_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nchw_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims - 1) { + if (ic == oc) { + ret = scale_nhwc_int32<true>(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + ret = scale_nhwc_int32<false>(input, alpha, beta, on, oc, elements_per_channel, output); + } + } else if (axis == nDims) { + ret = scale_nchwc8_int32(input, alpha, beta, on, oc, elements_per_channel, output); + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h b/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h new file mode 100644 index 00000000..b52434ba --- /dev/null +++ b/compute/tensor/src/cpu/x86/int32/tensor_computing_int32.h @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef CHEETAH_TENSOR_COMPUTING_INT32_H +#define CHEETAH_TENSOR_COMPUTING_INT32_H + +#include "error.h" + +#include "thread_affinity.h" + +EE scale_int32(I32 *input, + I32 axis, + I32 nDims, + I32 *alpha, + I32 *beta, + I32 on, + I32 oc, + I32 elements_per_channel, + I32 ic, + I32 *output); +#endif diff --git a/compute/tensor/src/cpu/x86/int8/convolution.cpp b/compute/tensor/src/cpu/x86/int8/convolution.cpp index 277593d3..1581d5a1 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution.cpp @@ -12,8 +12,6 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "sys.h" -#include "error.h" - #include "cpu/x86/int8/tensor_computing_int8.h" EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, @@ -34,10 +32,10 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 ih_pad = ih + paddingT + paddingB; U32 iw_pad = iw + paddingL + paddingR; @@ -62,6 +60,9 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, if (idf != DF_NCHWC16) { *bytes += icPadding * ih_pad * iw_pad; } + if (paddingT > 1 || paddingB > 1 || paddingL > 1 || paddingR > 1) { + *bytes += oc * 4; + } break; } case CONVOLUTION_ALGORITHM_GEMM_ICNCHW: @@ -95,12 +96,13 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, EE convolution_int8(TensorDesc inputDesc, UINT8 *input, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filter, ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, TensorDesc biasDesc, - const I32 *bias, + const F32 *bias, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -132,11 +134,11 @@ EE convolution_int8(TensorDesc inputDesc, EE ret = SUCCESS; switch (algorithm) { case CONVOLUTION_ALGORITHM_DIRECT: - ret = convolution_direct(inputDesc, input, filterDesc, filter, convParamSpec, biasDesc, + ret = convolution_direct(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, scale, activationDesc); break; case CONVOLUTION_ALGORITHM_POINTWISE: - ret = convolution_1x1_direct(inputDesc, input, filterDesc, filter, convParamSpec, + ret = convolution_1x1_direct(inputDesc, input, eltwiseInput, filterDesc, filter, convParamSpec, biasDesc, bias, tmpBytes, tmp, outputDesc, output, scale, activationDesc); break; default: diff --git a/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp b/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp index a55a25a8..3dc729eb 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_1x1_direct.cpp @@ -17,1919 +17,2215 @@ #include "error.h" #include "transform_functions_int8.h" #include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/convolution_functions.h" #include "cpu/x86/tensor_computing_x86.h" #define SIMDW 16 #define BLOCK_IC_DIM 256 #define BLOCK_HW_DIM 768 -struct ConvController { - UINT8 *input; - const INT8 *filter; - void *output; - UINT8 *u8Output; - const I32 *bias; - I64 ic; - I64 kw; - I64 kh; - I64 stepC16; - I64 dilateW; - I64 dilateH; - I64 ostepC16; - I64 flags; - I64 fStep; - I64 f8Step; - I64 f4Step; - void *scale; -}; - -typedef void (*kernelFunc)(ConvController &c); - // clang-format off -#define clear1Regs(rtype) \ - "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" - -#define clear2Regs(rtype) \ - clear1Regs(rtype) \ - "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" - -#define clear3Regs(rtype) \ - clear2Regs(rtype) \ - "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" - -#define clear12Regs(rtype) \ - clear3Regs(rtype) \ - "vxorps 
"#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ - "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ - "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ - "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ - "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ - "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ - "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ - "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ - "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" - -#define clear24Regs(rtype) \ - clear12Regs(rtype) \ - "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ - "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ - "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ - "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ - "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ - "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ - "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ - "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ - "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ - "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ - "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ - "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" - -#define reluReg(rtype) \ - "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" - -#define relu2Regs(rtype) \ - reluReg(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" - -#define relu3Regs(rtype) \ - relu2Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" - -#define relu12Regs(rtype) \ - relu3Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" - -#define relu24Regs(rtype) \ - relu12Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" - -#define convertRegI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - -#define convert2RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - -#define convert3RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" -#define 
convert12RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ - "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ - "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ - "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ - "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ - "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ - "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ - "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ - "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ - "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ - "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ - "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ - "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ - "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ - "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ - "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ - "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ - "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" - -#define convert24RegsI32ToF32(scalePtr, rtype) \ - convert12RegsI32ToF32(scalePtr, rtype) \ - "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ - "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ - "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ - "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ - "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ - "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ - "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ - "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ - "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ - "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ - "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ - "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ - "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ - "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ - "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ - "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ - "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ - "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ - "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ - "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ - "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ - "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ - "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ - "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" -#define load48BiasTo3Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - "vmovups 0x80("#bias"), %%zmm2 \n\t" \ - -#define load48BiasTo12Regs(bias) \ - load48BiasTo3Regs(bias) \ - "vmovups %%zmm0, %%zmm3 \n\t" \ - "vmovups %%zmm1, %%zmm4 \n\t" \ - "vmovups %%zmm2, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm2, %%zmm8 \n\t" \ - "vmovups %%zmm0, %%zmm9 \n\t" \ - "vmovups %%zmm1, %%zmm10 \n\t" \ - "vmovups %%zmm2, %%zmm11 \n\t" - -#define load48BiasTo24Regs(bias) \ - load48BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm2, %%zmm14 \n\t" \ - "vmovups %%zmm0, %%zmm15 \n\t" \ - "vmovups %%zmm1, %%zmm16 \n\t" \ - "vmovups %%zmm2, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm2, %%zmm20 \n\t" \ - "vmovups %%zmm0, %%zmm21 \n\t" \ - "vmovups %%zmm1, %%zmm22 \n\t" \ - "vmovups %%zmm2, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel8x48c4(input, freg0, freg1, 
freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" +#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + 
"vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" #else -#define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, 
"#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" - -#define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", 
%%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" - -#define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" - -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) +#define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd 
"#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + 
"vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd "#i0"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) + +#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) + 
+#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7) \ + convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, \ + %%zmm27, %%zmm28, %%zmm29, \ + i0, i1, i2, i3, i4, i5, i6, i7) #endif #define convKernelForLoopXx48(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ - "addq $0xC0, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load48BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x300, %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x180, %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - 
"%zmm31", "memory", "cc"); \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ + "addq $0xC0, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load48BiasTo##rnum##Regs(%[bias]) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x180, 0x1C0, 0x200, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0x240, 0x280, 0x2C0, \ + %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70) \ + "addq $0x300, %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x8, 0x10, 0x18, \ + 0x20, 0x28, 0x30, 0x38) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, \ + %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x8, 0x10, 0x18, \ + 0x20, 0x28, 0x30, 0x38) \ + "addq $0x180, %[filter] \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, \ + %%zmm27, %%zmm28, %%zmm29, \ + 0x0, 0x4, 0x8, 0xC, \ + 0x10, 0x14, 0x18, 0x1C) \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", 
"%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel8x48(ConvController &c) { convKernelForLoopXx48(24, 8) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm3, 0x40(%%rax) \n\t" - "vmovups %%zmm6, 0x80(%%rax) \n\t" - "vmovups %%zmm9, 0xC0(%%rax) \n\t" - 
"vmovups %%zmm12, 0x100(%%rax) \n\t" - "vmovups %%zmm15, 0x140(%%rax) \n\t" - "vmovups %%zmm18, 0x180(%%rax) \n\t" - "vmovups %%zmm21, 0x1C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm16, %%zmm16 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vaddps 0x100(%[eltwise], %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vaddps 0x140(%[eltwise], %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vaddps 0x180(%[eltwise], %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm3, 0x40(%%rax) \n\t" + "vmovups %%zmm6, 0x80(%%rax) \n\t" + "vmovups %%zmm9, 0xC0(%%rax) \n\t" + "vmovups %%zmm12, 0x100(%%rax) \n\t" + "vmovups %%zmm15, 0x140(%%rax) \n\t" + "vmovups %%zmm18, 0x180(%%rax) \n\t" + "vmovups %%zmm21, 0x1C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" 
(c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel4x48(ConvController &c) { convKernelForLoopXx48(12, 4) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + 
"vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%zmm0, (%%rax) \n\t" "vmovups %%zmm3, 0x40(%%rax) \n\t" "vmovups %%zmm6, 0x80(%%rax) \n\t" "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); + } void Avx512Conv1x1Kernel1x48(ConvController &c) { convKernelForLoopXx48(3, 1) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu3Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert3RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx 
\n\t" + "je 4f \n\t" + relu3RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", + "%zmm24", "%zmm31", "memory", "cc"); } -#define load32BiasTo2Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - -#define load32BiasTo12Regs(bias) \ - load32BiasTo2Regs(bias) \ - "vmovups %%zmm0, %%zmm2 \n\t" \ - "vmovups %%zmm1, %%zmm3 \n\t" \ - "vmovups %%zmm0, %%zmm4 \n\t" \ - "vmovups %%zmm1, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm0, %%zmm8 \n\t" \ - "vmovups %%zmm1, %%zmm9 \n\t" \ - "vmovups %%zmm0, %%zmm10 \n\t" \ - "vmovups %%zmm1, %%zmm11 \n\t" - -#define load32BiasTo24Regs(bias) \ - load32BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm0, %%zmm14 \n\t" \ - "vmovups %%zmm1, %%zmm15 \n\t" \ - "vmovups %%zmm0, %%zmm16 \n\t" \ - "vmovups %%zmm1, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm0, %%zmm20 \n\t" \ - "vmovups %%zmm1, %%zmm21 \n\t" \ - "vmovups %%zmm0, %%zmm22 \n\t" \ - "vmovups %%zmm1, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm29 \n\t" \ +#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", 
%%zmm30, %%zmm13 \n\t" \ - "vpbroadcastd 0x80("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x90("#input"), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ - "vpbroadcastd 0xA0("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0xB0("#input"), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm29 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd "#i8"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i9"("#input"), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ + "vpbroadcastd "#i10"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i11"("#input"), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm29 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm30 \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm31 \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm30 \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm31 \n\t" \ "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm28 \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm28 \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm29 \n\t" \ "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4(input, freg0, freg1, 
off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" #else -#define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpbroadcastd 0x60("#input"), %%zmm29 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x70("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x80("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x90("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0xA0("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", 
%%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0xB0("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" - -#define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd 0x10("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x20("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd 0x30("#input"), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x40("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd 0x50("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" - -#define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - 
"vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" - -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) +#define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd "#i6"("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i7"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i8"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i9"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", 
%%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i10"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i11"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + "vpbroadcastd "#i0"("#input"), %%zmm29 \n\t" \ + "vpbroadcastd "#i1"("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i2"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd "#i3"("#input"), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i4"("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd "#i5"("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2, \ + i0, i1, i2, i3, i4, i5, i6, 
i7, i8, i9, i10, i11) \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" + +#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) + +#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) + +#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) \ + convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, \ + %%zmm26, %%zmm27, %%zmm28, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11) #endif #define convKernelForLoopXx32(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "addq $0x80, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load32BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ - "addq $0x200, %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x4, %%rax \n\t" 
\ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "addq $0x80, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load32BiasTo##rnum##Regs(%[bias]) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, \ + %%zmm26, %%zmm27, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0) \ + "addq $0x200, %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, \ + 0x38, 0x40, 0x48, 0x50, 0x58) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, \ + %%zmm24, %%zmm25, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, 0x30, \ + 0x38, 0x40, 0x48, 0x50, 0x58) \ "addq $0x100, %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - convKernel##wsize##x32c4(%[input], %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x80, %[filter] \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), \ - [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", 
"%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + convKernel##wsize##x32c4(%[input], %%zmm24, %%zmm25, 0x0, 0x40, \ + %%zmm26, %%zmm27, \ + 0x0, 0x4, 0x8, 0xC, 0x10, 0x14, \ + 0x18, 0x1C, 0x20, 0x24, 0x28, 0x2C) \ + "addq $0x80, %[filter] \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel12x32(ConvController &c) { convKernelForLoopXx32(24, 12) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + 
"vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm12, 0x180(%%rax) \n\t" - "vmovups %%zmm14, 0x1C0(%%rax) \n\t" - "vmovups %%zmm16, 0x200(%%rax) \n\t" - "vmovups %%zmm18, 0x240(%%rax) \n\t" - "vmovups %%zmm20, 0x280(%%rax) \n\t" - "vmovups %%zmm22, 0x2C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" - "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" - "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm15, %%zmm15 \n\t" + "vaddps 0x200(%[eltwise], %%rbx), %%zmm17, %%zmm17 \n\t" + "vaddps 0x240(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x280(%[eltwise], %%rbx), %%zmm21, %%zmm21 \n\t" + "vaddps 0x2C0(%[eltwise], %%rbx), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 
0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm12, 0x180(%%rax) \n\t" + "vmovups %%zmm14, 0x1C0(%%rax) \n\t" + "vmovups %%zmm16, 0x200(%%rax) \n\t" + "vmovups %%zmm18, 0x240(%%rax) \n\t" + "vmovups %%zmm20, 0x280(%%rax) \n\t" + "vmovups %%zmm22, 0x2C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" + "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" + "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel6x32(ConvController &c) { convKernelForLoopXx32(12, 6) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 
4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert12RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x32(ConvController &c) { convKernelForLoopXx32(2, 1) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" 
+ "je 4f \n\t" relu2Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert2RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu2RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm24", "%zmm31", "memory", "cc"); } -#define load16BiasTo1Regs(bias, rtype) \ - "vmovups ("#bias"), "#rtype"0 \n\t" - -#define load16BiasTo12Regs(bias, rtype) \ - load16BiasTo1Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"1 \n\t" \ - "vmovups "#rtype"0, "#rtype"2 \n\t" \ - "vmovups "#rtype"0, "#rtype"3 \n\t" \ - "vmovups "#rtype"0, "#rtype"4 \n\t" \ - "vmovups "#rtype"0, "#rtype"5 \n\t" \ - "vmovups "#rtype"0, "#rtype"6 \n\t" \ - "vmovups "#rtype"0, "#rtype"7 \n\t" \ - "vmovups "#rtype"0, "#rtype"8 \n\t" \ - "vmovups "#rtype"0, "#rtype"9 \n\t" \ - "vmovups "#rtype"0, "#rtype"10 \n\t" \ - "vmovups "#rtype"0, "#rtype"11 \n\t" - -#define load16BiasTo24Regs(bias, rtype) \ - load16BiasTo12Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"12 \n\t" \ - "vmovups "#rtype"0, "#rtype"13 \n\t" \ - "vmovups "#rtype"0, "#rtype"14 \n\t" \ - "vmovups "#rtype"0, "#rtype"15 \n\t" \ - "vmovups "#rtype"0, "#rtype"16 \n\t" \ - "vmovups "#rtype"0, "#rtype"17 \n\t" \ - "vmovups "#rtype"0, "#rtype"18 \n\t" \ - "vmovups "#rtype"0, "#rtype"19 \n\t" \ - "vmovups "#rtype"0, "#rtype"20 \n\t" \ - "vmovups "#rtype"0, "#rtype"21 \n\t" \ - "vmovups "#rtype"0, "#rtype"22 \n\t" \ - "vmovups "#rtype"0, "#rtype"23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"31 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"29 \n\t" \ - 
"vpbroadcastd 0xA0("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ - "vpbroadcastd 0xC0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xD0("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0xE0("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0xF0("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x100("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x110("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ - "vpbroadcastd 0x120("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x130("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x140("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x150("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x160("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x170("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"31 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"27 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"28 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"30 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"31 \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" +#define convKernel24x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"28 
\n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd "#i12"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i13"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i14"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i15"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i16"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i17"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd "#i18"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i19"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i20"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i21"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i22"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i23"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"27 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd 
"#i11"("#input"), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" #else -#define convKernel24x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x60("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0xC0("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xD0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xE0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ 
- "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0xF0("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x100("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x110("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ - "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ - "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x120("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x130("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x140("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ - "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ - "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x150("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x160("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x170("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ - "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ - "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ - "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ - "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" - -#define convKernel12x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x10("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x20("#input"), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x30("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x40("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x50("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - 
"vpbroadcastd 0x60("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0x70("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0x80("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd 0x90("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd 0xA0("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd 0xB0("#input"), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" - -#define convKernel1x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" - -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype) - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype) - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype) +#define convKernel24x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd 
"#i7"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i12"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i13"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i14"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i15"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i16"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i17"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i18"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i19"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i20"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i21"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i22"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i23"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", 
"#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + +#define convKernel12x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd "#i0"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i1"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i2"("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i3"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i4"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i5"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i6"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i7"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i8"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd "#i9"("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd "#i10"("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd "#i11"("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + +#define convKernel1x16c4_3(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd 
"#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) + +#define convKernel12x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) + +#define convKernel1x16c4(input, freg0, off0, preg0, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) \ + convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, \ + i12, i13, i14, i15, i16, i17, i18, i19, i20, i21, i22, i23) #endif #define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4) \ - __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ - "addq $"#off1", %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load16BiasTo##rnum##Regs(%[bias], rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype) \ - "addq $"#off4", %[filter] \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $"#off2", %[filter] \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - 
convKernel##wsize##x16c4(%[input], rtype##24, off0, rtype##25, rtype) \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ + "addq $"#off1", %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load16BiasTo##rnum##Regs(%[bias], rtype) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(rtype) \ + "cmpq $0x10, %%rcx \n\t" \ + "jl 4f \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype, \ + 0x0, 0x10, 0x20, 0x30, 0x40, 0x50, \ + 0x60, 0x70, 0x80, 0x90, 0xA0, 0xB0, \ + 0xC0, 0xD0, 0xE0, 0xF0, 0x100, 0x110, \ + 0x120, 0x130, 0x140, 0x150, 0x160, 0x170) \ + "addq $"#off4", %[filter] \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [fStep] "r" (c.fStep), \ + [flags] "r" (c.flags), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, \ + 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, \ + 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, \ + 0x90, 0x98, 
0xA0, 0xA8, 0xB0, 0xB8) \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype, \ + 0x0, 0x8, 0x10, 0x18, 0x20, 0x28, \ + 0x30, 0x38, 0x40, 0x48, 0x50, 0x58, \ + 0x60, 0x68, 0x70, 0x78, 0x80, 0x88, \ + 0x90, 0x98, 0xA0, 0xA8, 0xB0, 0xB8) \ + "addq $"#off2", %[filter] \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + convKernel##wsize##x16c4(%[input], rtype##24, off0, rtype##25, rtype, \ + 0x0, 0x4, 0x8, 0xC, 0x10, 0x14, \ + 0x18, 0x1C, 0x20, 0x24, 0x28, 0x2C, \ + 0x30, 0x34, 0x38, 0x3C, 0x40, 0x44, \ + 0x48, 0x4C, 0x50, 0x54, 0x58, 0x5C) \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512Conv1x1Kernel24x16(ConvController &c) { convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 
\n\t" + "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" - "vmovups %%zmm12, 0x300(%%rax) \n\t" - "vmovups %%zmm13, 0x340(%%rax) \n\t" - "vmovups %%zmm14, 0x380(%%rax) \n\t" - "vmovups %%zmm15, 0x3C0(%%rax) \n\t" - "vmovups %%zmm16, 0x400(%%rax) \n\t" - "vmovups %%zmm17, 0x440(%%rax) \n\t" - "vmovups %%zmm18, 0x480(%%rax) \n\t" - "vmovups %%zmm19, 0x4C0(%%rax) \n\t" - "vmovups %%zmm20, 0x500(%%rax) \n\t" - "vmovups %%zmm21, 0x540(%%rax) \n\t" - "vmovups %%zmm22, 0x580(%%rax) \n\t" - "vmovups %%zmm23, 0x5C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x400(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x440(%[eltwise]), %%zmm17, %%zmm17 \n\t" + "vaddps 0x480(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x4C0(%[eltwise]), %%zmm19, %%zmm19 \n\t" + "vaddps 0x500(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x540(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps 0x580(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps 0x5C0(%[eltwise]), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 
0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + "vmovups %%zmm12, 0x300(%%rax) \n\t" + "vmovups %%zmm13, 0x340(%%rax) \n\t" + "vmovups %%zmm14, 0x380(%%rax) \n\t" + "vmovups %%zmm15, 0x3C0(%%rax) \n\t" + "vmovups %%zmm16, 0x400(%%rax) \n\t" + "vmovups %%zmm17, 0x440(%%rax) \n\t" + "vmovups %%zmm18, 0x480(%%rax) \n\t" + "vmovups %%zmm19, 0x4C0(%%rax) \n\t" + "vmovups %%zmm20, 0x500(%%rax) \n\t" + "vmovups %%zmm21, 0x540(%%rax) \n\t" + "vmovups %%zmm22, 0x580(%%rax) \n\t" + "vmovups %%zmm23, 0x5C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30","%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel12x16(ConvController &c) { convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 
16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6","%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x16(ConvController &c) { convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%zmm) - ".align 
16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0","%zmm24", "%zmm31", "memory", "cc"); } void Avx512Conv1x1Kernel24x8(ConvController &c) { convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" - "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" - "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" - "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" - "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" - "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" - "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" + "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" + "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" + "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" + "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" + "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" + "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" + "vpaddd 0x260(%%rax), 
%%ymm19, %%ymm19 \n\t" + "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" + "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" + "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" + "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" - "vmovups %%ymm12, 0x180(%%rax) \n\t" - "vmovups %%ymm13, 0x1A0(%%rax) \n\t" - "vmovups %%ymm14, 0x1C0(%%rax) \n\t" - "vmovups %%ymm15, 0x1E0(%%rax) \n\t" - "vmovups %%ymm16, 0x200(%%rax) \n\t" - "vmovups %%ymm17, 0x220(%%rax) \n\t" - "vmovups %%ymm18, 0x240(%%rax) \n\t" - "vmovups %%ymm19, 0x260(%%rax) \n\t" - "vmovups %%ymm20, 0x280(%%rax) \n\t" - "vmovups %%ymm21, 0x2A0(%%rax) \n\t" - "vmovups %%ymm22, 0x2C0(%%rax) \n\t" - "vmovups %%ymm23, 0x2E0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + "vaddps 0x180(%[eltwise]), %%ymm12, %%ymm12 \n\t" + "vaddps 0x1A0(%[eltwise]), %%ymm13, %%ymm13 \n\t" + "vaddps 0x1C0(%[eltwise]), %%ymm14, %%ymm14 \n\t" + "vaddps 0x1E0(%[eltwise]), %%ymm15, %%ymm15 \n\t" + "vaddps 0x200(%[eltwise]), %%ymm16, %%ymm16 \n\t" + "vaddps 0x220(%[eltwise]), %%ymm17, %%ymm17 \n\t" + "vaddps 0x240(%[eltwise]), %%ymm18, %%ymm18 \n\t" + "vaddps 0x260(%[eltwise]), %%ymm19, %%ymm19 \n\t" + "vaddps 0x280(%[eltwise]), %%ymm20, %%ymm20 \n\t" + "vaddps 0x2A0(%[eltwise]), %%ymm21, %%ymm21 \n\t" + "vaddps 0x2C0(%[eltwise]), %%ymm22, %%ymm22 \n\t" + "vaddps 0x2E0(%[eltwise]), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" + "vmovups %%ymm12, 0x180(%%rax) \n\t" + "vmovups %%ymm13, 0x1A0(%%rax) \n\t" + "vmovups %%ymm14, 0x1C0(%%rax) \n\t" + "vmovups %%ymm15, 0x1E0(%%rax) 
\n\t" + "vmovups %%ymm16, 0x200(%%rax) \n\t" + "vmovups %%ymm17, 0x220(%%rax) \n\t" + "vmovups %%ymm18, 0x240(%%rax) \n\t" + "vmovups %%ymm19, 0x260(%%rax) \n\t" + "vmovups %%ymm20, 0x280(%%rax) \n\t" + "vmovups %%ymm21, 0x2A0(%%rax) \n\t" + "vmovups %%ymm22, 0x2C0(%%rax) \n\t" + "vmovups %%ymm23, 0x2E0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", + "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", + "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", + "%ymm30", "%ymm31", "memory", "cc"); } void Avx512Conv1x1Kernel12x8(ConvController &c) { convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 
0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm24", "%ymm31", "memory", "cc"); } void Avx512Conv1x1Kernel1x8(ConvController &c) { convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%ymm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" 
(c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm24", "%ymm31", "memory", "cc"); +} + +template <typename T1> +EE activateBias(const T1 *biasArray, T1 *activatedArray, U32 len, ActivationMode mode) { + switch (mode) { + case ACTIVATION_RELU: { + for (U32 ocb = 0; ocb < len; ++ocb) { + activatedArray[ocb] = (biasArray[ocb] <= 0)? 0: biasArray[ocb]; + } + break; + } + case ACTIVATION_RELU6: { + for (U32 ocb = 0; ocb < len; ++ocb) { + activatedArray[ocb] = + (biasArray[ocb] <= 0)? 0: ((biasArray[ocb] >= 6)? 6: biasArray[ocb]); + } + break; + } + default: + return NOT_SUPPORTED; + } + return SUCCESS; +} + +inline void getActivatedBiasForPadding( + const F32 *biasArray, TensorDesc biasDesc, DataType targetType, void *activatedBias, ActivationMode mode, F32 scaleB) +{ + if (targetType == DT_I32) { + CHECK_STATUS(quantize_bias_offsetC((const void *)biasArray, biasDesc, DT_I32, + nullptr, biasDesc, &scaleB, activatedBias)); + CHECK_STATUS(activateBias((const I32 *)activatedBias, + (I32 *)activatedBias, tensorNumElements(biasDesc), mode)); + } else if (targetType == DT_F32) { + CHECK_STATUS(activateBias((const F32 *)biasArray, + (F32 *)activatedBias, tensorNumElements(biasDesc), mode)); + } else { + CHECK_STATUS(NOT_MATCH); + } } // clang-format on EE convolution_1x1_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -1967,20 +2263,17 @@ EE convolution_1x1_direct(TensorDesc inputDesc, // get computing params U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; - U32 ih_pad = ih + paddingT + paddingB; - U32 iw_pad = iw + paddingL + paddingR; - U32 ih_stride = (ih_pad + strideH - 1) / strideH; - U32 iw_stride = (iw_pad + strideW - 1) / strideW; + U32 ih_stride = (ih + strideH - 1) / strideH; + U32 iw_stride = (iw + strideW - 1) / strideW; U32 ohow = oh * ow; UINT8 *output = (UINT8 *)outArray; - CHECK_REQUIREMENT(paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0); // infer block params // infer kernel params @@ -1989,7 +2282,6 @@ EE convolution_1x1_direct(TensorDesc inputDesc, convCtl.dilateW = dilateW * SIMDW; convCtl.dilateH = (iw_stride - fw * dilateW + (dilateH - 1) * iw_stride) * SIMDW; convCtl.fStep = ih_stride * iw_stride * SIMDW; - convCtl.stepC16 = 16; convCtl.kw = fw; convCtl.kh = fh; convCtl.scale = nullptr; @@ -2016,9 +2308,12 @@ EE convolution_1x1_direct(TensorDesc inputDesc, tmp 
= (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); outputDesc.dt = DT_I32; } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } F32 *factorPtr = nullptr; F32 factor = 0; - if (scale != nullptr && odt == DT_F32) { + if (scale != nullptr && outputDesc.dt == DT_F32) { factor = 1 / (*scaleO); factorPtr = &factor; } @@ -2029,12 +2324,20 @@ EE convolution_1x1_direct(TensorDesc inputDesc, (const void *)filterArray, filterDesc, scaleO, offsetC)); filterArray += oc * 4; + F32 *activatedBias = (F32 *)tmp; + if (paddingT > 0 || paddingB > 0 || paddingL > 0 || paddingR > 0) { + getActivatedBiasForPadding( + biasArray, biasDesc, outputDesc.dt, activatedBias, activationDesc.mode, *scaleO); + tmp = (void *)((U8 *)tmp + oc * bytesOf(DT_F32)); + } + U32 oBytes = bytesOf(outputDesc.dt); UINT8 *tmpInput = (UINT8 *)tmp; if (idf != DF_NCHWC16) { tmp = (void *)((U8 *)tmp + ic * ih * iw); } UINT8 *useInput = (UINT8 *)tmp; + for (U32 n = 0; n < in; ++n) { UINT8 *bInArray = inArray + n * ic * ih * iw; if (idf == DF_NCHWC16) { @@ -2053,9 +2356,9 @@ EE convolution_1x1_direct(TensorDesc inputDesc, for (U32 w = 0; w < iw_stride; ++w) { U32 nh = h * strideH; U32 nw = w * strideW; - memcpy( + UNI_MEMCPY( useInput + c * ih_stride * iw_stride * SIMDW + (h * iw_stride + w) * SIMDW, - tmpInput + c * ih_pad * iw_pad * SIMDW + (nh * iw_pad + nw) * SIMDW, SIMDW); + tmpInput + c * ih * iw * SIMDW + (nh * iw + nw) * SIMDW, SIMDW); } } } else { @@ -2068,6 +2371,7 @@ EE convolution_1x1_direct(TensorDesc inputDesc, icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); flags |= (icbb > 0); if (icbb == (int)ic - icSize) { + flags |= (eltwiseInput != nullptr) << 1; flags |= U32(activationDesc.mode) << 2; convCtl.scale = factorPtr; } @@ -2077,35 +2381,94 @@ EE convolution_1x1_direct(TensorDesc inputDesc, if (icSize < SIMDW) { simdC = icSizeArray[icSize >> 3]; } - U32 hwSize = 0; - for (U32 hw = 0; hw < ohow; hw += hwSize) { - U32 ocSize = 0; - hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - hw); - for (U32 ocb = 0; ocb < oc; ocb += ocSize) { - ocSize = UNI_MIN(unrollOc, oc - ocb); - ocSize = ocSizeArray[ocSize >> 4]; - simdOc = UNI_MIN(SIMDW, ocSize); - convCtl.bias = offsetC + ocb; - UINT8 *curI = useInput + icbb * ih_stride * iw_stride; - U32 wSize = 8; - U32 unrollW = wSizeArray[ocSize >> 4]; - for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { - wSize = UNI_MIN(hw + hwSize - ihw, unrollW); - U32 idx = wSize * 2 / unrollW; - wSize = UNI_MAX(idx * unrollW / 2, 1); - U32 in_h = ihw / ow; - U32 in_w = ihw % ow; - convCtl.input = curI + in_h * iw_stride * simdC + in_w * simdC; - convCtl.output = output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; - convCtl.filter = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; - if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { - U32 cx = (ic % 8 == 0) ? 
8 : 4; - convCtl.f8Step = - convCtl.fStep - (in_h * iw_stride + in_w) * (SIMDW - cx); - convCtl.f4Step = convCtl.fStep / 2 - (in_h * iw_stride + in_w) * (8 - 4); + if (paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { + U32 hwSize = 0; + for (U32 hw = 0; hw < ohow; hw += hwSize) { + U32 ocSize = 0; + hwSize = UNI_MIN(BLOCK_HW_DIM, ohow - hw); + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unrollOc, oc - ocb); + ocSize = ocSizeArray[ocSize >> 4]; + simdOc = UNI_MIN(SIMDW, ocSize); + convCtl.bias = offsetC + ocb; + UINT8 *curI = useInput + icbb * ih_stride * iw_stride; + U32 wSize = 8; + U32 unrollW = wSizeArray[ocSize >> 4]; + for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, unrollW); + U32 idx = wSize * 2 / unrollW; + wSize = UNI_MAX(idx * unrollW / 2, 1); + U32 in_h = ihw / ow; + U32 in_w = ihw % ow; + convCtl.input = curI + in_h * iw_stride * simdC + in_w * simdC; + convCtl.output = + output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + (n * oc + ocb) * ohow + ihw * simdOc; + convCtl.filter = + filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { + U32 cx = (ic % 8 == 0) ? 8 : 4; + convCtl.f8Step = + convCtl.fStep - (in_h * iw_stride + in_w) * (SIMDW - cx); + convCtl.f4Step = + convCtl.fStep / 2 - (in_h * iw_stride + in_w) * (8 - 4); + } + convCtl.ic = icSize; + kernel[ocSize >> 4][idx](convCtl); + } + } + } + } else { + for (U32 h = 0; h < oh; ++h) { + U32 ocSize = 0; + for (U32 ocb = 0; ocb < oc; ocb += ocSize) { + ocSize = UNI_MIN(unrollOc, oc - ocb); + ocSize = ocSizeArray[ocSize >> 4]; + simdOc = UNI_MIN(SIMDW, ocSize); + convCtl.bias = offsetC + ocb; + UINT8 *curI = useInput + icbb * ih_stride * iw_stride; + U32 wSize = 8; + U32 unrollW = wSizeArray[ocSize >> 4]; + for (U32 w = 0; w < ow; w += wSize) { + wSize = 1; + convCtl.output = + output + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + // directly store activated bias + if ((h < paddingT) || (h >= ih_stride + paddingT) || (w < paddingL) || + (w >= paddingL + iw_stride)) { + if (!(flags & 0x2) && (icbb == (int)ic - icSize)) { + int oci = 0; + for (oci = 0; oci < (int)ocSize + 1 - SIMDW; oci += SIMDW) { + UNI_MEMCPY(((U8 *)convCtl.output) + ohow * oci * oBytes, + activatedBias + oci + ocb, SIMDW * oBytes); + } + for (; oci < (int)ocSize; oci += 8) { + UNI_MEMCPY(((U8 *)convCtl.output) + ohow * oci * oBytes, + activatedBias + oci + ocb, 8 * oBytes); + } + } + continue; + } + wSize = UNI_MIN(iw_stride - (w - paddingL), unrollW); + U32 idx = wSize * 2 / unrollW; + wSize = UNI_MAX(idx * unrollW / 2, 1); + + convCtl.input = + curI + (h - paddingT) * iw_stride * simdC + (w - paddingL) * simdC; + convCtl.filter = + filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; + if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { + U32 cx = (ic % 8 == 0) ? 
8 : 4; + convCtl.f8Step = convCtl.fStep - + ((h - paddingT) * iw_stride + (w - paddingL)) * (SIMDW - cx); + convCtl.f4Step = convCtl.fStep / 2 - + ((h - paddingT) * iw_stride + (w - paddingL)) * (8 - 4); + } + convCtl.ic = icSize; + kernel[ocSize >> 4][idx](convCtl); } - convCtl.ic = icSize; - kernel[ocSize >> 4][idx](convCtl); } } } diff --git a/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp b/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp index 7279f7fe..ad6767e3 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_direct.cpp @@ -17,2179 +17,3297 @@ #include "error.h" #include "transform_functions_int8.h" #include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/convolution_functions.h" #include "cpu/x86/tensor_computing_x86.h" +#include "cpu/tensor_computing_cpu.h" #define SIMDW 16 #define BLOCK_IC_DIM 128 -#define BLOCK_HW_DIM 1024 - -struct ConvController { - UINT8 *input; - const INT8 *filter; - void *output; - UINT8 *u8Output; - const I32 *bias; - I64 ic; - I64 kw; - I64 kh; - I64 stepC16; - I64 dilateW; - I64 dilateH; - I64 ostepC16; - I64 flags; - I64 fStep; - I64 f8Step; - I64 f4Step; - void *scale; -}; - -typedef void (*kernelFunc)(ConvController &c); +#define BLOCK_HW_DIM 96 // clang-format off -#define clear1Regs(rtype) \ - "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" - -#define clear2Regs(rtype) \ - clear1Regs(rtype) \ - "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" - -#define clear3Regs(rtype) \ - clear2Regs(rtype) \ - "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" - -#define clear12Regs(rtype) \ - clear3Regs(rtype) \ - "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ - "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ - "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ - "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ - "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ - "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ - "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ - "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ - "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" - -#define clear24Regs(rtype) \ - clear12Regs(rtype) \ - "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ - "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ - "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ - "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ - "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ - "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ - "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ - "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ - "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ - "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ - "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ - "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" - -#define reluReg(rtype) \ - "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" - -#define relu2Regs(rtype) \ - reluReg(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" - -#define relu3Regs(rtype) \ - relu2Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" - -#define relu12Regs(rtype) \ - relu3Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ - "vpmaxsd 
"#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" - -#define relu24Regs(rtype) \ - relu12Regs(rtype) \ - "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ - "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" - -#define convertRegI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - -#define convert2RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - -#define convert3RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" -#define convert12RegsI32ToF32(scalePtr, rtype) \ - "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ - "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ - "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ - "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ - "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ - "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ - "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ - "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ - "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ - "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ - "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ - "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ - "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ - "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ - "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ - "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ - "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ - "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ - "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ - "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ - "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ - "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ - "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ - "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ - "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" - -#define convert24RegsI32ToF32(scalePtr, rtype) \ - convert12RegsI32ToF32(scalePtr, rtype) \ - "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ - "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ - "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ - "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ - "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ - "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ - "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ - "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ - "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ - "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ - "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ - "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ - "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ - 
"vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ - "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ - "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ - "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ - "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ - "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ - "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ - "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ - "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ - "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ - "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" -#define load48BiasTo3Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - "vmovups 0x80("#bias"), %%zmm2 \n\t" \ - -#define load48BiasTo12Regs(bias) \ - load48BiasTo3Regs(bias) \ - "vmovups %%zmm0, %%zmm3 \n\t" \ - "vmovups %%zmm1, %%zmm4 \n\t" \ - "vmovups %%zmm2, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm2, %%zmm8 \n\t" \ - "vmovups %%zmm0, %%zmm9 \n\t" \ - "vmovups %%zmm1, %%zmm10 \n\t" \ - "vmovups %%zmm2, %%zmm11 \n\t" - -#define load48BiasTo24Regs(bias) \ - load48BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm2, %%zmm14 \n\t" \ - "vmovups %%zmm0, %%zmm15 \n\t" \ - "vmovups %%zmm1, %%zmm16 \n\t" \ - "vmovups %%zmm2, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm2, %%zmm20 \n\t" \ - "vmovups %%zmm0, %%zmm21 \n\t" \ - "vmovups %%zmm1, %%zmm22 \n\t" \ - "vmovups %%zmm2, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ - 
"vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ - "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" - -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ - "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" +#define convKernel8x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" 
\ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" + +#define convKernel8x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm14 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm15 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm16 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm17 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm19 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm21 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd 
"#freg2", %%zmm31, %%zmm23 \n\t" + +#define convKernel4x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm3 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm4 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm5 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm7 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm8 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm9 \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm10 \n\t" \ + "vpdpbusd "#freg2", %%zmm31, %%zmm11 \n\t" + +#define convKernel1x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vmovups "#off2"(%[filter]), "#preg2" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm1 \n\t" \ + "vpdpbusd "#freg2", %%zmm30, %%zmm2 \n\t" + #else + #define convKernel8x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd 
%%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + 
"vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" #define convKernel4x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - 
"vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ - "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ - "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ - "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ - "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd 
%%zmm11, "#preg2", %%zmm11 \n\t" #define convKernel1x48c4_3(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ - "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ - "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ - "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" - -#define convKernel8x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) - -#define convKernel4x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq (%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x10(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" 
\n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x20(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x30(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel4x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq (%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 
\n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "addq 0x10(%[stepC16]), "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x48c4_4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg2", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off2"(%[filter]), "#freg2" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" + +#define convKernel8x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel8x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel4x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel4x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel1x48c4_0(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel8x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel8x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel4x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel4x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) + +#define convKernel1x48c4_1(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ + convKernel1x48c4_4(input, %%zmm24, %%zmm25, %%zmm26, \ + off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) -#define convKernel1x48c4(input, freg0, freg1, freg2, off0, off1, off2, preg0, preg1, preg2) \ - convKernel1x48c4_3(input, %%zmm24, %%zmm25, %%zmm26, off0, off1, off2, %%zmm27, %%zmm28, %%zmm29) #endif -#define convKernelForLoopXx48(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ - "addq $0xC0, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ 
- "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load48BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x300, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm27, %%zmm28, %%zmm29, 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ - "addq $0x180, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x48c4(%%rax, %%zmm24, %%zmm25, %%zmm26, 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ 
- "addq $0xC0, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx48(rnum, wsize, cross) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "vmovups 0x80(%[filter]), %%zmm26 \n\t" \ + "addq $0xC0, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load48BiasTo##rnum##Regs(%[bias]) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1","%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x180, 0x1C0, 0x200, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0x240, 0x280, 0x2C0, %%zmm24, %%zmm25, %%zmm26) \ + "addq $0x300, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", 
"%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm27, %%zmm28, %%zmm29, \ + 0xC0, 0x100, 0x140, %%zmm24, %%zmm25, %%zmm26) \ + "addq $0x180, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x48c4_##cross(%%rax, %%zmm24, %%zmm25, %%zmm26, \ + 0x0, 0x40, 0x80, %%zmm27, %%zmm28, %%zmm29) \ + "addq $0xC0, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel8x48(ConvController &c) { - convKernelForLoopXx48(24, 8) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - 
"vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + if (c.cross) { + convKernelForLoopXx48(24, 8, 1) + } else { + convKernelForLoopXx48(24, 8, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x100(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x140(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x180(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x100(%%rax, %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x140(%%rax, %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x180(%%rax, %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm3, 0x40(%%rax) \n\t" - "vmovups %%zmm6, 0x80(%%rax) \n\t" - "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm12, 0x100(%%rax) \n\t" - "vmovups %%zmm15, 0x140(%%rax) \n\t" - "vmovups %%zmm18, 0x180(%%rax) \n\t" - "vmovups %%zmm21, 0x1C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 
0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm16, %%zmm16 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + "vaddps 0x100(%[eltwise], %%rbx, 2), %%zmm14, %%zmm14 \n\t" + "vaddps 0x140(%[eltwise], %%rbx, 2), %%zmm17, %%zmm17 \n\t" + "vaddps 0x180(%[eltwise], %%rbx, 2), %%zmm20, %%zmm20 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx, 2), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm3, 0x40(%%rax) \n\t" + "vmovups %%zmm6, 0x80(%%rax) \n\t" + "vmovups %%zmm9, 0xC0(%%rax) \n\t" + "vmovups %%zmm12, 0x100(%%rax) \n\t" + "vmovups %%zmm15, 0x140(%%rax) \n\t" + "vmovups %%zmm18, 0x180(%%rax) \n\t" + "vmovups %%zmm21, 0x1C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm16, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm22, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm14, 0x100(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm17, 0x140(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm20, 0x180(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm23, 0x1C0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } -void Avx512ConvKernel4x48(ConvController &c) { - convKernelForLoopXx48(12, 4) - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq 
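The rewritten epilogue of `Avx512ConvKernel8x48` orders its post-ops as: optional accumulation of the previous int32 output (flag bit 0), then either an integer ReLU when no scale pointer is given, or an int32-to-fp32 conversion followed by an optional elementwise add from `c.eltwise` (flag bit 1) and a float ReLU (flag bits 2-3), before the final stores. A compact per-element sketch of that control flow, under the assumption that the convert macro also applies the per-channel scale; the function and parameter names are illustrative.

```cpp
#include <cstdint>

// Scalar sketch of the per-element post-processing order in the rewritten epilogue.
// flags bit 0: accumulate the previous int32 output; bit 1: elementwise add from
// c.eltwise (fp32 path only); bits 2-3: ReLU. A non-null scale selects the fp32 path.
static inline void postProcessElement(int32_t acc, int32_t prevOut, float eltwise,
                                      uint32_t flags, const float *scale,
                                      int32_t *dstI32, float *dstF32)
{
    if (flags & 0x1) {
        acc += prevOut;                 // vpaddd with the existing output tile
    }
    if (scale == nullptr) {             // integer output path (no eltwise add)
        if ((flags & 0xC) && acc < 0) {
            acc = 0;                    // relu24Regs
        }
        *dstI32 = acc;                  // vmovups of the int32 registers
        return;
    }
    float v = (float)acc * (*scale);    // convert24RegsI32ToF32 (scale assumed)
    if (flags & 0x2) {
        v += eltwise;                   // vaddps with c.eltwise
    }
    if ((flags & 0xC) && v < 0.0f) {
        v = 0.0f;                       // relu24RegsPs
    }
    *dstF32 = v;                        // vmovups of the fp32 registers
}
```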
%[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" - "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" +void Avx512ConvKernel4x48(ConvController &c) { + if (c.cross) { + convKernelForLoopXx48(12, 4, 1) + } else { + convKernelForLoopXx48(12, 4, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0xC0(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x40(%%rax, %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x80(%%rax, %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vpaddd 0xC0(%%rax, %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm4, %%zmm4 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + "vaddps 0x40(%[eltwise], %%rbx, 2), %%zmm5, %%zmm5 \n\t" + "vaddps 0x80(%[eltwise], %%rbx, 2), %%zmm8, %%zmm8 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx, 2), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" "vmovups %%zmm0, (%%rax) \n\t" "vmovups %%zmm3, 0x40(%%rax) \n\t" "vmovups %%zmm6, 0x80(%%rax) \n\t" "vmovups %%zmm9, 0xC0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" - "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm4, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0x80(%%rax, 
%%rbx) \n\t" + "vmovups %%zmm10, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm5, 0x40(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm8, 0x80(%%rax, %%rbx, 2) \n\t" + "vmovups %%zmm11, 0xC0(%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x48(ConvController &c) { - convKernelForLoopXx48(3, 1) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + if (c.cross) { + convKernelForLoopXx48(3, 1, 1) + } else { + convKernelForLoopXx48(3, 1, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd (%%rax, %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu3Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert3RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps (%[eltwise], %%rbx, 2), %%zmm2, %%zmm2 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu3RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm2, (%%rax, %%rbx, 2) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", 
"%zmm24", "%zmm31", + "memory", "cc"); } -#define load32BiasTo2Regs(bias) \ - "vmovups ("#bias"), %%zmm0 \n\t" \ - "vmovups 0x40("#bias"), %%zmm1 \n\t" \ - -#define load32BiasTo12Regs(bias) \ - load32BiasTo2Regs(bias) \ - "vmovups %%zmm0, %%zmm2 \n\t" \ - "vmovups %%zmm1, %%zmm3 \n\t" \ - "vmovups %%zmm0, %%zmm4 \n\t" \ - "vmovups %%zmm1, %%zmm5 \n\t" \ - "vmovups %%zmm0, %%zmm6 \n\t" \ - "vmovups %%zmm1, %%zmm7 \n\t" \ - "vmovups %%zmm0, %%zmm8 \n\t" \ - "vmovups %%zmm1, %%zmm9 \n\t" \ - "vmovups %%zmm0, %%zmm10 \n\t" \ - "vmovups %%zmm1, %%zmm11 \n\t" - -#define load32BiasTo24Regs(bias) \ - load32BiasTo12Regs(bias) \ - "vmovups %%zmm0, %%zmm12 \n\t" \ - "vmovups %%zmm1, %%zmm13 \n\t" \ - "vmovups %%zmm0, %%zmm14 \n\t" \ - "vmovups %%zmm1, %%zmm15 \n\t" \ - "vmovups %%zmm0, %%zmm16 \n\t" \ - "vmovups %%zmm1, %%zmm17 \n\t" \ - "vmovups %%zmm0, %%zmm18 \n\t" \ - "vmovups %%zmm1, %%zmm19 \n\t" \ - "vmovups %%zmm0, %%zmm20 \n\t" \ - "vmovups %%zmm1, %%zmm21 \n\t" \ - "vmovups %%zmm0, %%zmm22 \n\t" \ - "vmovups %%zmm1, %%zmm23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" - -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq 
%[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ - "vpbroadcastd ("#input"), %%zmm30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm31 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ - "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm29 \n\t" \ - "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ - "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ - "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ - "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" - -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ - "vpbroadcastd ("#input"), %%zmm28 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ - "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ - "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" +#define convKernel12x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd 
"#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" + +#define convKernel12x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm12 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm13 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm14 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm15 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm16 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, 
%%zmm17 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm18 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm19 \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm20 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm21 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm22 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm23 \n\t" + +#define convKernel6x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" \ + "vpbroadcastd ("#input"), %%zmm30 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm31 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm2 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm3 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", %%zmm30, %%zmm4 \n\t" \ + "vpdpbusd "#freg1", %%zmm30, %%zmm5 \n\t" \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm29 \n\t" \ + "vpdpbusd "#freg0", %%zmm31, %%zmm6 \n\t" \ + "vpdpbusd "#freg1", %%zmm31, %%zmm7 \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm8 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm9 \n\t" \ + "vpdpbusd "#freg0", %%zmm29, %%zmm10 \n\t" \ + "vpdpbusd "#freg1", %%zmm29, %%zmm11 \n\t" + +#define convKernel1x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ + "vpbroadcastd ("#input"), %%zmm28 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vmovups "#off1"(%[filter]), "#preg1" \n\t" \ + "vpdpbusd "#freg0", %%zmm28, %%zmm0 \n\t" \ + "vpdpbusd "#freg1", %%zmm28, %%zmm1 \n\t" + #else + #define convKernel12x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw 
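On VNNI-capable parts, each step of the `convKernel*x32c4_*` macros above collapses the three-instruction emulation into a single `vpdpbusd`, which multiplies four unsigned bytes by four signed bytes, sums the products, and adds the result into an int32 accumulator with no int16 intermediate (so no saturation concern). A one-lane scalar equivalent, with illustrative names:

```cpp
#include <cstdint>

// Scalar equivalent of one vpdpbusd lane: four u8*s8 products summed into an int32 accumulator.
static inline int32_t vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4])
{
    return acc + (int32_t)a[0] * b[0] + (int32_t)a[1] * b[1]
               + (int32_t)a[2] * b[2] + (int32_t)a[3] * b[3];
}
```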
"#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ - "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ - "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ - "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ - "vpaddd 
%%zmm17, "#preg2", %%zmm17 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ - "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ - "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ - "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ - "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ - "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" #define convKernel6x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - 
"vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ - "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), 
"#freg1" \n\t" \ "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ - "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ - "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" #define convKernel1x32c4_3(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ - "vpbroadcastd ("#input"), %%zmm29 \n\t" \ - "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ - "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ - "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ - "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" + +#define convKernel12x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", 
%%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm12, "#preg0", %%zmm12 \n\t" \ + "vpaddd %%zmm13, "#preg1", %%zmm13 \n\t" \ + "vpaddd %%zmm14, "#preg2", %%zmm14 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm15, "#preg0", %%zmm15 \n\t" \ + "vpaddd %%zmm16, "#preg1", %%zmm16 \n\t" \ + "vpaddd %%zmm17, "#preg2", %%zmm17 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm18, "#preg0", %%zmm18 \n\t" \ + "vpaddd %%zmm19, "#preg1", %%zmm19 \n\t" \ + "vpaddd %%zmm20, "#preg2", %%zmm20 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm21, "#preg0", %%zmm21 \n\t" \ + "vpaddd %%zmm22, "#preg1", %%zmm22 \n\t" \ + "vpaddd %%zmm23, "#preg2", %%zmm23 \n\t" + +#define convKernel6x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ + "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" \ + "vpaddd %%zmm2, "#preg2", %%zmm2 \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + 
"vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpaddd %%zmm3, "#preg0", %%zmm3 \n\t" \ + "vpaddd %%zmm4, "#preg1", %%zmm4 \n\t" \ + "vpaddd %%zmm5, "#preg2", %%zmm5 \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vpbroadcastd ("#input", %%r10), %%zmm30 \n\t" \ + "vpaddd %%zmm6, "#preg0", %%zmm6 \n\t" \ + "vpaddd %%zmm7, "#preg1", %%zmm7 \n\t" \ + "vpaddd %%zmm8, "#preg2", %%zmm8 \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg0", %%zmm30, "#preg1" \n\t" \ + "vpmaddubsw "#freg1", %%zmm30, "#preg2" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg2", %%zmm31, "#preg2" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ + "vpaddd %%zmm9, "#preg0", %%zmm9 \n\t" \ + "vpaddd %%zmm10, "#preg1", %%zmm10 \n\t" \ + "vpaddd %%zmm11, "#preg2", %%zmm11 \n\t" + +#define convKernel1x32c4_4(input, freg0, freg1, off0, off1, preg0, preg1, preg2) \ + "vpbroadcastd ("#input"), %%zmm29 \n\t" \ + "vpmaddubsw "#freg0", %%zmm29, "#preg0" \n\t" \ + "vpmaddubsw "#freg1", %%zmm29, "#preg1" \n\t" \ + "vpmaddwd "#preg0", %%zmm31, "#preg0" \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpmaddwd "#preg1", %%zmm31, "#preg1" \n\t" \ + "vmovups "#off1"(%[filter]), "#freg1" \n\t" \ "vpaddd %%zmm0, "#preg0", %%zmm0 \n\t" \ "vpaddd %%zmm1, "#preg1", %%zmm1 \n\t" -#define convKernel12x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel12x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel12x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) -#define convKernel6x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel6x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel6x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) -#define convKernel1x32c4(input, freg0, freg1, off0, off1, preg0, preg1) \ +#define convKernel1x32c4_0(input, freg0, freg1, off0, off1, preg0, preg1) \ convKernel1x32c4_3(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel12x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel12x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel6x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel6x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + +#define convKernel1x32c4_1(input, freg0, freg1, off0, off1, preg0, preg1) \ + convKernel1x32c4_4(input, %%zmm24, %%zmm25, off0, off1, %%zmm26, %%zmm27, %%zmm28) + #endif -#define convKernelForLoopXx32(rnum, wsize) \ - __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ - "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ - "addq $0x80, %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load32BiasTo##rnum##Regs(%[bias]) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(%%zmm) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov 
%[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ - "addq $0x200, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx32(rnum, wsize, cross) \ + __asm__ __volatile__("vmovups (%[filter]), %%zmm24 \n\t" \ + "vmovups 0x40(%[filter]), %%zmm25 \n\t" \ + "addq $0x80, %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, %%zmm31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load32BiasTo##rnum##Regs(%[bias]) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(%%zmm) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x100, 0x140, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x180, 0x1C0, %%zmm24, %%zmm25) \ + "addq $0x200, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx 
\n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ - "addq $0x100, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x32c4(%%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ - "addq $0x80, %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + 
convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm26, %%zmm27, 0x80, 0xC0, %%zmm24, %%zmm25) \ + "addq $0x100, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x32c4_##cross( \ + %%rax, %%zmm24, %%zmm25, 0x0, 0x40, %%zmm26, %%zmm27) \ + "addq $0x80, %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel12x32(ConvController &c) { - convKernelForLoopXx32(24, 12) + if (c.cross) { + convKernelForLoopXx32(24, 12, 1) + } else { + convKernelForLoopXx32(24, 12, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x180(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" - "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 
0x180(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x200(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x240(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x280(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x180(%%rax, %%rbx), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x1C0(%%rax, %%rbx), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x200(%%rax, %%rbx), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x240(%%rax, %%rbx), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x280(%%rax, %%rbx), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x2C0(%%rax, %%rbx), %%zmm23, %%zmm23 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm12, 0x180(%%rax) \n\t" - "vmovups %%zmm14, 0x1C0(%%rax) \n\t" - "vmovups %%zmm16, 0x200(%%rax) \n\t" - "vmovups %%zmm18, 0x240(%%rax) \n\t" - "vmovups %%zmm20, 0x280(%%rax) \n\t" - "vmovups %%zmm22, 0x2C0(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" - "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" - "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" - "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" - "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" - "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" - "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + "vaddps 0x180(%[eltwise], %%rbx), %%zmm13, %%zmm13 \n\t" + "vaddps 0x1C0(%[eltwise], %%rbx), %%zmm15, %%zmm15 \n\t" + "vaddps 0x200(%[eltwise], %%rbx), %%zmm17, %%zmm17 \n\t" + "vaddps 0x240(%[eltwise], %%rbx), %%zmm19, %%zmm19 \n\t" + "vaddps 0x280(%[eltwise], 
%%rbx), %%zmm21, %%zmm21 \n\t" + "vaddps 0x2C0(%[eltwise], %%rbx), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm12, 0x180(%%rax) \n\t" + "vmovups %%zmm14, 0x1C0(%%rax) \n\t" + "vmovups %%zmm16, 0x200(%%rax) \n\t" + "vmovups %%zmm18, 0x240(%%rax) \n\t" + "vmovups %%zmm20, 0x280(%%rax) \n\t" + "vmovups %%zmm22, 0x2C0(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "vmovups %%zmm13, 0x180(%%rax, %%rbx) \n\t" + "vmovups %%zmm15, 0x1C0(%%rax, %%rbx) \n\t" + "vmovups %%zmm17, 0x200(%%rax, %%rbx) \n\t" + "vmovups %%zmm19, 0x240(%%rax, %%rbx) \n\t" + "vmovups %%zmm21, 0x280(%%rax, %%rbx) \n\t" + "vmovups %%zmm23, 0x2C0(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512ConvKernel6x32(ConvController &c) { - convKernelForLoopXx32(12, 6) + if (c.cross) { + convKernelForLoopXx32(12, 6, 1) + } else { + convKernelForLoopXx32(12, 6, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" - "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0x80(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0xC0(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x100(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x140(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd (%%rax, %%rbx), 
%%zmm1, %%zmm1 \n\t" + "vpaddd 0x40(%%rax, %%rbx), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x80(%%rax, %%rbx), %%zmm5, %%zmm5 \n\t" + "vpaddd 0xC0(%%rax, %%rbx), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x100(%%rax, %%rbx), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x140(%%rax, %%rbx), %%zmm11, %%zmm11 \n\t" ".align 16 \n\t" "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" ".align 16 \n\t" "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" convert12RegsI32ToF32(%[scale], %%zmm) ".align 16 \n\t" "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm2, 0x40(%%rax) \n\t" - "vmovups %%zmm4, 0x80(%%rax) \n\t" - "vmovups %%zmm6, 0xC0(%%rax) \n\t" - "vmovups %%zmm8, 0x100(%%rax) \n\t" - "vmovups %%zmm10, 0x140(%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" - "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" - "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" - "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" - "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" - "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + "vaddps 0x40(%[eltwise], %%rbx), %%zmm3, %%zmm3 \n\t" + "vaddps 0x80(%[eltwise], %%rbx), %%zmm5, %%zmm5 \n\t" + "vaddps 0xC0(%[eltwise], %%rbx), %%zmm7, %%zmm7 \n\t" + "vaddps 0x100(%[eltwise], %%rbx), %%zmm9, %%zmm9 \n\t" + "vaddps 0x140(%[eltwise], %%rbx), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm2, 0x40(%%rax) \n\t" + "vmovups %%zmm4, 0x80(%%rax) \n\t" + "vmovups %%zmm6, 0xC0(%%rax) \n\t" + "vmovups %%zmm8, 0x100(%%rax) \n\t" + "vmovups %%zmm10, 0x140(%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + "vmovups %%zmm3, 0x40(%%rax, %%rbx) \n\t" + "vmovups %%zmm5, 0x80(%%rax, %%rbx) \n\t" + "vmovups %%zmm7, 0xC0(%%rax, %%rbx) \n\t" + "vmovups %%zmm9, 0x100(%%rax, %%rbx) \n\t" + "vmovups %%zmm11, 0x140(%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x32(ConvController &c) { - convKernelForLoopXx32(2, 1) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - 
"vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" - - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + convKernelForLoopXx32(24, 12, 0) + + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd (%%rax, %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu2Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert2RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, (%%rax, %%rbx) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps (%[eltwise], %%rbx), %%zmm1, %%zmm1 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu2RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, (%%rax, %%rbx) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [eltwise] "r" (c.eltwise), + [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm24", "%zmm31", + "memory", "cc"); } -#define load16BiasTo1Regs(bias, rtype) \ - "vmovups ("#bias"), "#rtype"0 \n\t" - -#define load16BiasTo12Regs(bias, rtype) \ - load16BiasTo1Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"1 \n\t" \ - "vmovups "#rtype"0, "#rtype"2 \n\t" \ - "vmovups "#rtype"0, "#rtype"3 \n\t" \ - "vmovups "#rtype"0, "#rtype"4 \n\t" \ - "vmovups "#rtype"0, "#rtype"5 \n\t" \ - "vmovups "#rtype"0, "#rtype"6 \n\t" \ - "vmovups "#rtype"0, "#rtype"7 \n\t" \ - "vmovups "#rtype"0, "#rtype"8 \n\t" \ - "vmovups "#rtype"0, "#rtype"9 \n\t" \ - "vmovups "#rtype"0, "#rtype"10 \n\t" \ - "vmovups "#rtype"0, "#rtype"11 \n\t" - -#define load16BiasTo24Regs(bias, rtype) \ - load16BiasTo12Regs(bias, rtype) \ - "vmovups "#rtype"0, "#rtype"12 \n\t" \ - "vmovups "#rtype"0, "#rtype"13 \n\t" \ - "vmovups "#rtype"0, "#rtype"14 \n\t" \ - "vmovups "#rtype"0, "#rtype"15 \n\t" \ - "vmovups "#rtype"0, "#rtype"16 \n\t" \ - "vmovups "#rtype"0, "#rtype"17 \n\t" \ - "vmovups "#rtype"0, "#rtype"18 \n\t" \ - "vmovups "#rtype"0, "#rtype"19 \n\t" \ - "vmovups "#rtype"0, "#rtype"20 \n\t" \ - "vmovups "#rtype"0, "#rtype"21 \n\t" \ - "vmovups "#rtype"0, "#rtype"22 \n\t" \ - "vmovups "#rtype"0, "#rtype"23 \n\t" - #ifdef _USE_AVX512_VNNI -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), 
"#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" - -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq 
%[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"27 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"28 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"30 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"31 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ - "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ - "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ - "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ - "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ - "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" - -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ - "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ - "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" +#define convKernel24x16c4_1(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "addq 
%%r10, "#input" \n\t" \ + "addq 0x58(%[stepC16]), "#input" \n\t" \ + "movq 0x60(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x68(%[stepC16]), "#input" \n\t" \ + "movq 0x70(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x78(%[stepC16]), "#input" \n\t" \ + "movq 0x80(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x88(%[stepC16]), "#input" \n\t" \ + "movq 0x90(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x98(%[stepC16]), "#input" \n\t" \ + "movq 0xA0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0xA8(%[stepC16]), "#input" \n\t" \ + "movq 0xB0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4_1(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"28 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"29 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 
0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"31 \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4_1(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" + +#define convKernel24x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"12 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"13 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"14 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"15 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"16 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"17 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd 
("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"18 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"19 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"20 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"21 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"22 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"23 \n\t" + +#define convKernel12x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"1 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"2 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"3 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"4 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"5 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"28 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpbroadcastd ("#input"), "#rtype"29 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"31 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"6 \n\t" \ + "vpdpbusd "#freg0", "#rtype"27, "#rtype"7 \n\t" \ + "vpdpbusd "#freg0", "#rtype"28, "#rtype"8 \n\t" \ + "vpdpbusd "#freg0", "#rtype"29, "#rtype"9 \n\t" \ + "vpdpbusd "#freg0", "#rtype"30, "#rtype"10 \n\t" \ + "vpdpbusd "#freg0", "#rtype"31, "#rtype"11 \n\t" + +#define convKernel1x16c4_0(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vmovups "#off0"(%[filter]), "#preg0" \n\t" \ + "vpdpbusd "#freg0", "#rtype"26, "#rtype"0 \n\t" + #else #define convKernel24x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", 
"#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ - "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ - "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", 
%[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ - "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ - "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ - "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ - "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ - "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ - "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 
\n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, 
"#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" #define convKernel12x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ - "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ - "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ - "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ - "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "addq %[stepC16], "#input" \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpbroadcastd ("#input", %[stepC16]), "#rtype"26 \n\t" \ - "vpbroadcastd ("#input", %[stepC16], 2), "#rtype"27 \n\t" \ - "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ - "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ - "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ - "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ - "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ - "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + 
"vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq %%r10, "#input" \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10, 2), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" #define convKernel1x16c4_3(input, freg0, off0, preg0, rtype) \ - "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ - "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ - "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ - "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ - "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" - -#define convKernel24x16c4(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4_4(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", 
%%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x58(%[stepC16]), "#input" \n\t" \ + "movq 0x60(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", 
"#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x68(%[stepC16]), "#input" \n\t" \ + "movq 0x70(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"12, "#rtype"28, "#rtype"12 \n\t" \ + "vpaddd "#rtype"13, "#rtype"29, "#rtype"13 \n\t" \ + "vpaddd "#rtype"14, "#rtype"30, "#rtype"14 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x78(%[stepC16]), "#input" \n\t" \ + "movq 0x80(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x88(%[stepC16]), "#input" \n\t" \ + "movq 0x90(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"15, "#rtype"28, "#rtype"15 \n\t" \ + "vpaddd "#rtype"16, "#rtype"29, "#rtype"16 \n\t" \ + "vpaddd "#rtype"17, "#rtype"30, "#rtype"17 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x98(%[stepC16]), "#input" \n\t" \ + "movq 0xA0(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"18, "#rtype"28, "#rtype"18 \n\t" \ + "vpaddd "#rtype"19, "#rtype"29, "#rtype"19 \n\t" \ + "vpaddd "#rtype"20, "#rtype"30, "#rtype"20 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"21, "#rtype"28, "#rtype"21 \n\t" \ + "vpaddd "#rtype"22, "#rtype"29, "#rtype"22 \n\t" \ + "vpaddd "#rtype"23, "#rtype"30, "#rtype"23 \n\t" + +#define convKernel12x16c4_4(input, freg0, off0, preg0, rtype) \ + "movq (%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x8(%[stepC16]), "#input" \n\t" \ + "movq 0x10(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, 
"#input" \n\t" \ + "addq 0x18(%[stepC16]), "#input" \n\t" \ + "movq 0x20(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" \ + "vpaddd "#rtype"1, "#rtype"29, "#rtype"1 \n\t" \ + "vpaddd "#rtype"2, "#rtype"30, "#rtype"2 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x28(%[stepC16]), "#input" \n\t" \ + "movq 0x30(%[stepC16]), %%r10 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"26 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x38(%[stepC16]), "#input" \n\t" \ + "movq 0x40(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"27 \n\t" \ + "vpaddd "#rtype"3, "#rtype"28, "#rtype"3 \n\t" \ + "vpaddd "#rtype"4, "#rtype"29, "#rtype"4 \n\t" \ + "vpaddd "#rtype"5, "#rtype"30, "#rtype"5 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"25 \n\t" \ + "addq %%r10, "#input" \n\t" \ + "addq 0x48(%[stepC16]), "#input" \n\t" \ + "movq 0x50(%[stepC16]), %%r10 \n\t" \ + "vpbroadcastd ("#input"), "#rtype"26 \n\t" \ + "vpbroadcastd ("#input", %%r10), "#rtype"27 \n\t" \ + "vpaddd "#rtype"6, "#rtype"28, "#rtype"6 \n\t" \ + "vpaddd "#rtype"7, "#rtype"29, "#rtype"7 \n\t" \ + "vpaddd "#rtype"8, "#rtype"30, "#rtype"8 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"26, "#rtype"29 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"27, "#rtype"30 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"29, "#rtype"31, "#rtype"29 \n\t" \ + "vpmaddwd "#rtype"30, "#rtype"31, "#rtype"30 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"9, "#rtype"28, "#rtype"9 \n\t" \ + "vpaddd "#rtype"10, "#rtype"29, "#rtype"10 \n\t" \ + "vpaddd "#rtype"11, "#rtype"30, "#rtype"11 \n\t" + +#define convKernel1x16c4_4(input, freg0, off0, preg0, rtype) \ + "vpbroadcastd ("#input"), "#rtype"25 \n\t" \ + "vpmaddubsw "#freg0", "#rtype"25, "#rtype"28 \n\t" \ + "vpmaddwd "#rtype"28, "#rtype"31, "#rtype"28 \n\t" \ + "vmovups "#off0"(%[filter]), "#freg0" \n\t" \ + "vpaddd "#rtype"0, "#rtype"28, "#rtype"0 \n\t" + +#define convKernel24x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel24x16c4_3(input, rtype##24, off0, rtype##25, rtype) -#define convKernel12x16c4(input, freg0, off0, preg0, rtype) \ +#define convKernel12x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel12x16c4_3(input, rtype##24, off0, rtype##25, rtype) -#define convKernel1x16c4(input, freg0, off0, preg0, rtype) \ +#define convKernel1x16c4_0(input, freg0, off0, preg0, rtype) \ convKernel1x16c4_3(input, rtype##24, off0, rtype##25, rtype) + +#define convKernel24x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel24x16c4_4(input, rtype##24, off0, rtype##25, rtype) + +#define convKernel12x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel12x16c4_4(input, 
rtype##24, off0, rtype##25, rtype) + +#define convKernel1x16c4_1(input, freg0, off0, preg0, rtype) \ + convKernel1x16c4_4(input, rtype##24, off0, rtype##25, rtype) + #endif -#define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4) \ - __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ - "addq $"#off1", %[filter] \n\t" \ - "mov $1, %%eax \n\t" \ - "vmovd %%eax, %%xmm0 \n\t" \ - "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ - "movq %[flags], %%rax \n\t" \ - "andq $0x1, %%rax \n\t" \ - "jne 0f \n\t" \ - load16BiasTo##rnum##Regs(%[bias], rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - "jmp 1f \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - clear##rnum##Regs(rtype) \ - "cmpq $0x10, %%rcx \n\t" \ - "jl 4f \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x8, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off2, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0xC, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off3, rtype##24, rtype) \ - "addq $"#off4", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 3b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 2b \n\t" \ - "addq %[fStep], %[input] \n\t" \ - "subq $0x10, %%rcx \n\t" \ - "cmpq $0x10, %%rcx \n\t" \ - "jge 1b \n\t" \ - "subq %[fStep], %[input] \n\t" \ - "addq %[f8Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "4: \n\t" \ - : "+c" (c.ic), [input] "+r" (c.input), [filter] "+r" (c.filter) \ - : [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), [flags] "r" (c.flags), \ - [f8Step] "r" (c.f8Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ - if (c.ic > 0) { \ - __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ - "jl 2f \n\t" \ - "subq $0x8, %%rcx \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[fStep] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "0: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - ".align 16 \n\t" \ - "1: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "movq %[input], %%rax \n\t" \ - "addq $0x4, %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##25, off1, rtype##24, rtype) \ - "addq $"#off2", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 1b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 0b \n\t" \ - "addq %[f4Step], %[input] \n\t" \ - ".align 16 \n\t" \ - "2: \n\t" \ - "cmpq $0x4, %%rcx \n\t" \ - "jl 5f \n\t" \ - "shr $1, %[dilateW] \n\t" \ - "shr $1, %[dilateH] \n\t" \ - "shr $1, %[stepC16] \n\t" \ - "mov %[kh], %%rbx \n\t" \ - ".align 16 \n\t" \ - "3: \n\t" \ - "mov %[kw], %%r9 \n\t" \ - 
".align 16 \n\t" \ - "4: \n\t" \ - "movq %[input], %%rax \n\t" \ - convKernel##wsize##x16c4(%%rax, rtype##24, off0, rtype##25, rtype) \ - "addq $"#off1", %[filter] \n\t" \ - "addq %[dilateW], %[input] \n\t" \ - "dec %%r9 \n\t" \ - "jg 4b \n\t" \ - "addq %[dilateH], %[input] \n\t" \ - "dec %%rbx \n\t" \ - "jg 3b \n\t" \ - ".align 16 \n\t" \ - "5: \n\t" \ - : "+c" (c.ic) \ - : [input] "r" (c.input), [filter] "r" (c.filter), [bias] "r" (c.bias), [kh] "r" (c.kh), [kw] "r" (c.kw), \ - [stepC16] "r" (c.stepC16), [dilateW] "r" (c.dilateW), \ - [dilateH] "r" (c.dilateH), [fStep] "r" (c.fStep), \ - [f4Step] "r" (c.f4Step) \ - : "%rax", "%rbx", "%r9", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", \ - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", \ - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", \ - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", \ - "%zmm31", "memory", "cc"); \ +#define convKernelForLoopXx16(rnum, wsize, rtype, off0, off1, off2, off3, off4, cross) \ + __asm__ __volatile__("vmovups (%[filter]), "#rtype"24 \n\t" \ + "addq $"#off1", %[filter] \n\t" \ + "mov $1, %%eax \n\t" \ + "vmovd %%eax, %%xmm0 \n\t" \ + "vpbroadcastw %%xmm0, "#rtype"31 \n\t" \ + "movq %[flags], %%rax \n\t" \ + "andq $0x1, %%rax \n\t" \ + "jne 0f \n\t" \ + load16BiasTo##rnum##Regs(%[bias], rtype) \ + "jmp 1f \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + clear##rnum##Regs(rtype) \ + ".align 16 \n\t" \ + "1: \n\t" \ + : [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [flags] "r" (c.flags) \ + : "%rax", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "memory", "cc"); \ + if (c.ic >= 16) { \ + __asm__ __volatile__("movq (%[stepC16]), %%r10 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off1, rtype##24, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x8, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off2, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0xC, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off3, rtype##24, rtype) \ + "addq $"#off4", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 3b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 2b \n\t" \ + "addq %[fStep], %[input] \n\t" \ + "subq $0x10, %%rcx \n\t" \ + "cmpq $0x10, %%rcx \n\t" \ + "jge 1b \n\t" \ + "subq %[fStep], %[input] \n\t" \ + "addq %[f8Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + : "+c" (c.ic), \ + [input] "+r" (c.input), \ + [filter] "+r" (c.filter) \ + : [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f8Step] "r" (c.f8Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", 
"%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ + } \ + if (c.ic > 0) { \ + __asm__ __volatile__("cmpq $0x8, %%rcx \n\t" \ + "jl 2f \n\t" \ + "subq $0x8, %%rcx \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "shr $1, %[fStep] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "0: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "1: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "movq %[input], %%rax \n\t" \ + "addq $0x4, %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##25, off1, rtype##24, rtype) \ + "addq $"#off2", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 1b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 0b \n\t" \ + "addq %[f4Step], %[input] \n\t" \ + ".align 16 \n\t" \ + "2: \n\t" \ + "cmpq $0x4, %%rcx \n\t" \ + "jl 5f \n\t" \ + "shr $1, %[dilateW] \n\t" \ + "shr $1, %[dilateH] \n\t" \ + "addq $192, %[stepC16] \n\t" \ + "mov %[kh], %%rbx \n\t" \ + ".align 16 \n\t" \ + "3: \n\t" \ + "mov %[kw], %%r9 \n\t" \ + ".align 16 \n\t" \ + "4: \n\t" \ + "movq %[input], %%rax \n\t" \ + convKernel##wsize##x16c4_##cross( \ + %%rax, rtype##24, off0, rtype##25, rtype) \ + "addq $"#off1", %[filter] \n\t" \ + "addq %[dilateW], %[input] \n\t" \ + "dec %%r9 \n\t" \ + "jg 4b \n\t" \ + "addq %[dilateH], %[input] \n\t" \ + "dec %%rbx \n\t" \ + "jg 3b \n\t" \ + ".align 16 \n\t" \ + "5: \n\t" \ + : "+c" (c.ic) \ + : [input] "r" (c.input), \ + [filter] "r" (c.filter), \ + [bias] "r" (c.bias), \ + [kh] "r" (c.kh), \ + [kw] "r" (c.kw), \ + [stepC16] "r" (c.stepC16), \ + [dilateW] "r" (c.dilateW), \ + [dilateH] "r" (c.dilateH), \ + [fStep] "r" (c.fStep), \ + [f4Step] "r" (c.f4Step) \ + : "%rax", "%rbx", "%r9", "%r10", \ + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", \ + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", \ + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", \ + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", \ + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", \ + "%zmm30", "%zmm31", "memory", "cc"); \ } void Avx512ConvKernel24x16(ConvController &c) { - convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" - "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" - "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" - "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" - "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" - "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" - "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" - "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" - "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 
\n\t" - "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" - "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" - "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" - "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + if (c.cross) { + convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(24, 24, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + "vpaddd 0x300(%%rax), %%zmm12, %%zmm12 \n\t" + "vpaddd 0x340(%%rax), %%zmm13, %%zmm13 \n\t" + "vpaddd 0x380(%%rax), %%zmm14, %%zmm14 \n\t" + "vpaddd 0x3C0(%%rax), %%zmm15, %%zmm15 \n\t" + "vpaddd 0x400(%%rax), %%zmm16, %%zmm16 \n\t" + "vpaddd 0x440(%%rax), %%zmm17, %%zmm17 \n\t" + "vpaddd 0x480(%%rax), %%zmm18, %%zmm18 \n\t" + "vpaddd 0x4C0(%%rax), %%zmm19, %%zmm19 \n\t" + "vpaddd 0x500(%%rax), %%zmm20, %%zmm20 \n\t" + "vpaddd 0x540(%%rax), %%zmm21, %%zmm21 \n\t" + "vpaddd 0x580(%%rax), %%zmm22, %%zmm22 \n\t" + "vpaddd 0x5C0(%%rax), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + "cmpq $0x0, %[scale] \n\t" + "je 3f \n\t" convert24RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" - "vmovups %%zmm12, 0x300(%%rax) \n\t" - "vmovups %%zmm13, 0x340(%%rax) \n\t" - "vmovups %%zmm14, 0x380(%%rax) \n\t" - "vmovups %%zmm15, 0x3C0(%%rax) \n\t" - "vmovups %%zmm16, 0x400(%%rax) \n\t" - "vmovups %%zmm17, 0x440(%%rax) \n\t" - "vmovups %%zmm18, 0x480(%%rax) \n\t" - "vmovups %%zmm19, 0x4C0(%%rax) \n\t" - "vmovups %%zmm20, 0x500(%%rax) \n\t" - "vmovups %%zmm21, 0x540(%%rax) \n\t" - "vmovups %%zmm22, 0x580(%%rax) \n\t" - "vmovups %%zmm23, 0x5C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 
0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + "vaddps 0x400(%[eltwise]), %%zmm16, %%zmm16 \n\t" + "vaddps 0x440(%[eltwise]), %%zmm17, %%zmm17 \n\t" + "vaddps 0x480(%[eltwise]), %%zmm18, %%zmm18 \n\t" + "vaddps 0x4C0(%[eltwise]), %%zmm19, %%zmm19 \n\t" + "vaddps 0x500(%[eltwise]), %%zmm20, %%zmm20 \n\t" + "vaddps 0x540(%[eltwise]), %%zmm21, %%zmm21 \n\t" + "vaddps 0x580(%[eltwise]), %%zmm22, %%zmm22 \n\t" + "vaddps 0x5C0(%[eltwise]), %%zmm23, %%zmm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + "vmovups %%zmm12, 0x300(%%rax) \n\t" + "vmovups %%zmm13, 0x340(%%rax) \n\t" + "vmovups %%zmm14, 0x380(%%rax) \n\t" + "vmovups %%zmm15, 0x3C0(%%rax) \n\t" + "vmovups %%zmm16, 0x400(%%rax) \n\t" + "vmovups %%zmm17, 0x440(%%rax) \n\t" + "vmovups %%zmm18, 0x480(%%rax) \n\t" + "vmovups %%zmm19, 0x4C0(%%rax) \n\t" + "vmovups %%zmm20, 0x500(%%rax) \n\t" + "vmovups %%zmm21, 0x540(%%rax) \n\t" + "vmovups %%zmm22, 0x580(%%rax) \n\t" + "vmovups %%zmm23, 0x5C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm12", "%zmm13", "%zmm14", "%zmm15", "%zmm16", "%zmm17", + "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", "%zmm23", + "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", + "%zmm30", "%zmm31", "memory", "cc"); } void Avx512ConvKernel12x16(ConvController &c) { - convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" - "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" - "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" - "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" - "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" - "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" - "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" - "vpaddd 
0x240(%%rax), %%zmm9, %%zmm9 \n\t" - "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" - "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + if (c.cross) { + convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(12, 12, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + "vpaddd 0x80(%%rax), %%zmm2, %%zmm2 \n\t" + "vpaddd 0xC0(%%rax), %%zmm3, %%zmm3 \n\t" + "vpaddd 0x100(%%rax), %%zmm4, %%zmm4 \n\t" + "vpaddd 0x140(%%rax), %%zmm5, %%zmm5 \n\t" + "vpaddd 0x180(%%rax), %%zmm6, %%zmm6 \n\t" + "vpaddd 0x1C0(%%rax), %%zmm7, %%zmm7 \n\t" + "vpaddd 0x200(%%rax), %%zmm8, %%zmm8 \n\t" + "vpaddd 0x240(%%rax), %%zmm9, %%zmm9 \n\t" + "vpaddd 0x280(%%rax), %%zmm10, %%zmm10 \n\t" + "vpaddd 0x2C0(%%rax), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" - "vmovups %%zmm1, 0x40(%%rax) \n\t" - "vmovups %%zmm2, 0x80(%%rax) \n\t" - "vmovups %%zmm3, 0xC0(%%rax) \n\t" - "vmovups %%zmm4, 0x100(%%rax) \n\t" - "vmovups %%zmm5, 0x140(%%rax) \n\t" - "vmovups %%zmm6, 0x180(%%rax) \n\t" - "vmovups %%zmm7, 0x1C0(%%rax) \n\t" - "vmovups %%zmm8, 0x200(%%rax) \n\t" - "vmovups %%zmm9, 0x240(%%rax) \n\t" - "vmovups %%zmm10, 0x280(%%rax) \n\t" - "vmovups %%zmm11, 0x2C0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu12RegsPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" + "vmovups %%zmm1, 0x40(%%rax) \n\t" + "vmovups %%zmm2, 0x80(%%rax) \n\t" + "vmovups %%zmm3, 0xC0(%%rax) \n\t" + "vmovups %%zmm4, 0x100(%%rax) \n\t" + "vmovups %%zmm5, 0x140(%%rax) \n\t" + "vmovups %%zmm6, 0x180(%%rax) \n\t" + "vmovups %%zmm7, 0x1C0(%%rax) \n\t" + "vmovups %%zmm8, 0x200(%%rax) \n\t" + "vmovups %%zmm9, 0x240(%%rax) \n\t" + "vmovups %%zmm10, 0x280(%%rax) \n\t" + "vmovups %%zmm11, 0x2C0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", 
"%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", + "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", + "%zmm24", "%zmm31", "memory", "cc"); } void Avx512ConvKernel1x16(ConvController &c) { - convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100) + if (c.cross) { + convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 1) + } else { + convKernelForLoopXx16(1, 1, %%zmm, 0x0, 0x40, 0x80, 0xC0, 0x100, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" - "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%zmm0, %%zmm0 \n\t" + "vpaddd 0x40(%%rax), %%zmm1, %%zmm1 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%zmm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%zmm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%zmm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%zmm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", - "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", - "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", - "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%zmm0", "%zmm24", "%zmm31", + "memory", "cc"); } void Avx512ConvKernel24x8(ConvController &c) { - convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" - "vpaddd 0x180(%%rax), 
%%ymm12, %%ymm12 \n\t" - "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" - "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" - "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" - "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" - "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" - "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" - "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" - "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" - "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" - "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" - "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + if (c.cross) { + convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(24, 24, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + "vpaddd 0x180(%%rax), %%ymm12, %%ymm12 \n\t" + "vpaddd 0x1A0(%%rax), %%ymm13, %%ymm13 \n\t" + "vpaddd 0x1C0(%%rax), %%ymm14, %%ymm14 \n\t" + "vpaddd 0x1E0(%%rax), %%ymm15, %%ymm15 \n\t" + "vpaddd 0x200(%%rax), %%ymm16, %%ymm16 \n\t" + "vpaddd 0x220(%%rax), %%ymm17, %%ymm17 \n\t" + "vpaddd 0x240(%%rax), %%ymm18, %%ymm18 \n\t" + "vpaddd 0x260(%%rax), %%ymm19, %%ymm19 \n\t" + "vpaddd 0x280(%%rax), %%ymm20, %%ymm20 \n\t" + "vpaddd 0x2A0(%%rax), %%ymm21, %%ymm21 \n\t" + "vpaddd 0x2C0(%%rax), %%ymm22, %%ymm22 \n\t" + "vpaddd 0x2E0(%%rax), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu24Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert24RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" - "vmovups %%ymm12, 0x180(%%rax) \n\t" - "vmovups %%ymm13, 0x1A0(%%rax) \n\t" - "vmovups %%ymm14, 0x1C0(%%rax) \n\t" - "vmovups %%ymm15, 0x1E0(%%rax) \n\t" - "vmovups %%ymm16, 0x200(%%rax) \n\t" - "vmovups %%ymm17, 0x220(%%rax) \n\t" - "vmovups %%ymm18, 0x240(%%rax) \n\t" - "vmovups %%ymm19, 0x260(%%rax) \n\t" - "vmovups %%ymm20, 0x280(%%rax) \n\t" - "vmovups %%ymm21, 0x2A0(%%rax) \n\t" - "vmovups %%ymm22, 0x2C0(%%rax) \n\t" - "vmovups %%ymm23, 0x2E0(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 
0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + "vaddps 0x180(%[eltwise]), %%ymm12, %%ymm12 \n\t" + "vaddps 0x1A0(%[eltwise]), %%ymm13, %%ymm13 \n\t" + "vaddps 0x1C0(%[eltwise]), %%ymm14, %%ymm14 \n\t" + "vaddps 0x1E0(%[eltwise]), %%ymm15, %%ymm15 \n\t" + "vaddps 0x200(%[eltwise]), %%ymm16, %%ymm16 \n\t" + "vaddps 0x220(%[eltwise]), %%ymm17, %%ymm17 \n\t" + "vaddps 0x240(%[eltwise]), %%ymm18, %%ymm18 \n\t" + "vaddps 0x260(%[eltwise]), %%ymm19, %%ymm19 \n\t" + "vaddps 0x280(%[eltwise]), %%ymm20, %%ymm20 \n\t" + "vaddps 0x2A0(%[eltwise]), %%ymm21, %%ymm21 \n\t" + "vaddps 0x2C0(%[eltwise]), %%ymm22, %%ymm22 \n\t" + "vaddps 0x2E0(%[eltwise]), %%ymm23, %%ymm23 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" + "vmovups %%ymm12, 0x180(%%rax) \n\t" + "vmovups %%ymm13, 0x1A0(%%rax) \n\t" + "vmovups %%ymm14, 0x1C0(%%rax) \n\t" + "vmovups %%ymm15, 0x1E0(%%rax) \n\t" + "vmovups %%ymm16, 0x200(%%rax) \n\t" + "vmovups %%ymm17, 0x220(%%rax) \n\t" + "vmovups %%ymm18, 0x240(%%rax) \n\t" + "vmovups %%ymm19, 0x260(%%rax) \n\t" + "vmovups %%ymm20, 0x280(%%rax) \n\t" + "vmovups %%ymm21, 0x2A0(%%rax) \n\t" + "vmovups %%ymm22, 0x2C0(%%rax) \n\t" + "vmovups %%ymm23, 0x2E0(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm12", "%ymm13", "%ymm14", "%ymm15", "%ymm16", "%ymm17", + "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", "%ymm23", + "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", + "%ymm30", "%ymm31", "memory", "cc"); } void Avx512ConvKernel12x8(ConvController &c) { - convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) - - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" - "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 
\n\t" - "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" - "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" - "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" - "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" - "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" - "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" - "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" - "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" - "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + if (c.cross) { + convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(12, 12, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + "vpaddd 0x20(%%rax), %%ymm1, %%ymm1 \n\t" + "vpaddd 0x40(%%rax), %%ymm2, %%ymm2 \n\t" + "vpaddd 0x60(%%rax), %%ymm3, %%ymm3 \n\t" + "vpaddd 0x80(%%rax), %%ymm4, %%ymm4 \n\t" + "vpaddd 0xA0(%%rax), %%ymm5, %%ymm5 \n\t" + "vpaddd 0xC0(%%rax), %%ymm6, %%ymm6 \n\t" + "vpaddd 0xE0(%%rax), %%ymm7, %%ymm7 \n\t" + "vpaddd 0x100(%%rax), %%ymm8, %%ymm8 \n\t" + "vpaddd 0x120(%%rax), %%ymm9, %%ymm9 \n\t" + "vpaddd 0x140(%%rax), %%ymm10, %%ymm10 \n\t" + "vpaddd 0x160(%%rax), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" relu12Regs(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convert12RegsI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" - "vmovups %%ymm1, 0x20(%%rax) \n\t" - "vmovups %%ymm2, 0x40(%%rax) \n\t" - "vmovups %%ymm3, 0x60(%%rax) \n\t" - "vmovups %%ymm4, 0x80(%%rax) \n\t" - "vmovups %%ymm5, 0xA0(%%rax) \n\t" - "vmovups %%ymm6, 0xC0(%%rax) \n\t" - "vmovups %%ymm7, 0xE0(%%rax) \n\t" - "vmovups %%ymm8, 0x100(%%rax) \n\t" - "vmovups %%ymm9, 0x120(%%rax) \n\t" - "vmovups %%ymm10, 0x140(%%rax) \n\t" - "vmovups %%ymm11, 0x160(%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + "vaddps 0x20(%[eltwise]), %%ymm1, %%ymm1 \n\t" + "vaddps 0x40(%[eltwise]), %%ymm2, %%ymm2 \n\t" + "vaddps 0x60(%[eltwise]), %%ymm3, %%ymm3 \n\t" + "vaddps 0x80(%[eltwise]), %%ymm4, %%ymm4 \n\t" + "vaddps 0xA0(%[eltwise]), %%ymm5, %%ymm5 \n\t" + "vaddps 0xC0(%[eltwise]), %%ymm6, %%ymm6 \n\t" + "vaddps 0xE0(%[eltwise]), %%ymm7, %%ymm7 \n\t" + "vaddps 0x100(%[eltwise]), %%ymm8, %%ymm8 \n\t" + "vaddps 0x120(%[eltwise]), %%ymm9, %%ymm9 \n\t" + "vaddps 0x140(%[eltwise]), %%ymm10, %%ymm10 \n\t" + "vaddps 0x160(%[eltwise]), %%ymm11, %%ymm11 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + relu24RegsPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" + "vmovups %%ymm1, 0x20(%%rax) \n\t" + "vmovups %%ymm2, 0x40(%%rax) \n\t" + "vmovups %%ymm3, 0x60(%%rax) \n\t" + "vmovups %%ymm4, 0x80(%%rax) \n\t" + "vmovups %%ymm5, 0xA0(%%rax) \n\t" + "vmovups %%ymm6, 0xC0(%%rax) \n\t" + "vmovups %%ymm7, 0xE0(%%rax) \n\t" + "vmovups %%ymm8, 0x100(%%rax) \n\t" + "vmovups %%ymm9, 0x120(%%rax) \n\t" + "vmovups %%ymm10, 0x140(%%rax) \n\t" + "vmovups %%ymm11, 0x160(%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" 
(c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", + "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", + "%ymm24","%ymm31", "memory", "cc"); } void Avx512ConvKernel1x8(ConvController &c) { - convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80) + if (c.cross) { + convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 1) + } else { + convKernelForLoopXx16(1, 1, %%ymm, 0x0, 0x20, 0x40, 0x60, 0x80, 0) + } - __asm__ __volatile__("movq %[output], %%rax \n\t" - "movq %[ostepC16], %%rbx \n\t" - "movq %[flags], %%rcx \n\t" - "and $0x1, %%rcx \n\t" - "je 0f \n\t" - "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" + __asm__ __volatile__("movq %[output], %%rax \n\t" + "movq %[ostepC16], %%rbx \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x1, %%rcx \n\t" + "je 0f \n\t" + "vpaddd (%%rax), %%ymm0, %%ymm0 \n\t" - ".align 16 \n\t" - "0: \n\t" - "movq %[flags], %%rcx \n\t" - "and $0xC, %%rcx \n\t" - "je 1f \n\t" + ".align 16 \n\t" + "0: \n\t" + "cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" reluReg(%%ymm) + "jmp 4f \n\t" - ".align 16 \n\t" - "1: \n\t" - "cmpq $0x0, %[scale] \n\t" - "je 2f \n\t" + ".align 16 \n\t" + "1: \n\t" convertRegI32ToF32(%[scale], %%ymm) - ".align 16 \n\t" - "2: \n\t" - "vmovups %%ymm0, (%%rax) \n\t" + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%ymm0, %%ymm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + reluRegPs(%%ymm) + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%ymm0, (%%rax) \n\t" : - : [output] "r" (c.output), [ostepC16] "r" (c.ostepC16), [flags] "r" (c.flags), [scale] "r" (c.scale) - : "%rax", "%rbx", "%rcx", "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", - "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", - "%ymm15", "%ymm16", "%ymm17", "%ymm18", "%ymm19", "%ymm20", "%ymm21", "%ymm22", - "%ymm23", "%ymm24", "%ymm25", "%ymm26", "%ymm27", "%ymm28", "%ymm29", "%ymm30", - "%zmm31", "memory", "cc"); + : [output] "r" (c.output), + [ostepC16] "r" (c.ostepC16), + [eltwise] "r" (c.eltwise), + [flags] "r" (c.flags), + [scale] "r" (c.scale) + : "%rax", "%rbx", "%rcx", + "%ymm0", "%ymm24", "%ymm31", + "memory", "cc"); } // clang-format on EE convolution_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -2227,10 +3345,10 @@ EE convolution_direct(TensorDesc inputDesc, // get computing params U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 
paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; U32 ih_pad = ih + paddingT + paddingB; @@ -2246,7 +3364,7 @@ EE convolution_direct(TensorDesc inputDesc, convCtl.dilateW = dilateW * SIMDW; convCtl.dilateH = (iw_pad - fw * dilateW + (dilateH - 1) * iw_pad) * SIMDW; convCtl.fStep = ((ih_pad - fh * dilateH) * iw_pad) * SIMDW; - convCtl.stepC16 = strideW * 16; + // convCtl.stepC16 = strideW * 16; convCtl.kw = fw; convCtl.kh = fh; convCtl.scale = nullptr; @@ -2273,9 +3391,12 @@ EE convolution_direct(TensorDesc inputDesc, tmp = (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); outputDesc.dt = DT_I32; } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } F32 *factorPtr = nullptr; F32 factor = 0; - if (scale != nullptr && odt == DT_F32) { + if (scale != nullptr && outputDesc.dt == DT_F32) { factor = 1 / (*scaleO); factorPtr = &factor; } @@ -2288,6 +3409,15 @@ EE convolution_direct(TensorDesc inputDesc, U32 oBytes = bytesOf(outputDesc.dt); UINT8 *tmpInput = (UINT8 *)tmp; + I64 step[72]; + I64 normalStep = strideW * 16; + I64 lastStep = (iw_pad - (ow - 1) * strideW + (strideH - 1) * iw_pad) * 16; + for (U32 i = 0; i < 24; ++i) { + step[i] = strideW * 16; + step[i + 24] = strideW * 8; + step[i + 48] = strideW * 4; + } + convCtl.stepC16 = step; for (U32 n = 0; n < in; ++n) { UINT8 *bInArray = inArray + n * ic * ih * iw; if (idf == DF_NCHWC16 && paddingT == 0 && paddingB == 0 && paddingL == 0 && paddingR == 0) { @@ -2306,6 +3436,7 @@ EE convolution_direct(TensorDesc inputDesc, icSize = UNI_MIN(BLOCK_IC_DIM, ic - icbb); flags |= (icbb > 0); if (icbb == ic - icSize) { + flags |= (eltwiseInput != nullptr) << 1; flags |= U32(activationDesc.mode) << 2; convCtl.scale = factorPtr; } @@ -2315,7 +3446,10 @@ EE convolution_direct(TensorDesc inputDesc, if (icSize < SIMDW) { simdC = icSizeArray[icSize >> 3]; } - for (U32 h = 0; h < oh; ++h) { + + U32 hwSize = 0; + for (U32 hw = 0; hw < oh * ow; hw += hwSize) { + hwSize = UNI_MIN(BLOCK_HW_DIM, oh * ow - hw); U32 ocSize = 0; for (U32 ocb = 0; ocb < oc; ocb += ocSize) { ocSize = UNI_MIN(unrollOc, oc - ocb); @@ -2325,23 +3459,43 @@ EE convolution_direct(TensorDesc inputDesc, UINT8 *curI = tmpInput + icbb * ih_pad * iw_pad; U32 wSize = 8; U32 unrollW = wSizeArray[ocSize >> 4]; - for (U32 w = 0; w < ow; w += wSize) { - wSize = UNI_MIN(ow - w, unrollW); + for (U32 ihw = hw; ihw < hw + hwSize; ihw += wSize) { + wSize = UNI_MIN(hw + hwSize - ihw, unrollW); U32 idx = wSize * 2 / unrollW; wSize = UNI_MAX(idx * unrollW / 2, 1); - U32 in_h = h * strideH; - U32 in_w = w * strideW; + U32 in_h = ihw / ow * strideH; + U32 in_w = ihw % ow * strideW; convCtl.input = curI + in_h * iw_pad * simdC + in_w * simdC; - convCtl.output = - output + ((n * oc + ocb) * ohow + (h * ow + w) * simdOc) * oBytes; + convCtl.output = output + ((n * oc + ocb) * ohow + ihw * simdOc) * oBytes; + convCtl.eltwise = eltwiseInput + (n * oc + ocb) * ohow + ihw * simdOc; convCtl.filter = filterArray + ocb * ic * fh * fw + ocSize * icbb * fh * fw; if ((ic % 16 != 0) && (icbb == (int)ic - icSize)) { U32 cx = (ic % 8 == 0) ? 
8 : 4; convCtl.f8Step = convCtl.fStep - (in_h * iw_pad + in_w) * (SIMDW - cx); convCtl.f4Step = convCtl.fStep / 2 - (in_h * iw_pad + in_w) * (8 - 4); } + convCtl.cross = false; + if ((ihw % ow + wSize) > ow) { + U32 lane = (ihw % ow + wSize) / ow; + if ((ihw % ow + wSize) % ow == 0) { + --lane; + } + for (U32 ui = 0; ui < lane; ++ui) { + convCtl.stepC16[(ihw / ow + ui + 1) * ow - ihw - 1] = lastStep; + } + convCtl.cross = true; + } convCtl.ic = icSize; kernel[ocSize >> 4][idx](convCtl); + if ((ihw % ow + wSize) > ow) { + U32 lane = (ihw % ow + wSize) / ow; + if ((ihw % ow + wSize) % ow == 0) { + --lane; + } + for (U32 ui = 0; ui < lane; ++ui) { + convCtl.stepC16[(ihw / ow + ui + 1) * ow - ihw - 1] = normalStep; + } + } } } } @@ -2353,6 +3507,7 @@ EE convolution_direct(TensorDesc inputDesc, F32 scales[2] = {-1, scaleO[0]}; TensorDesc qDesc = outputDesc; qDesc.dt = DT_U8_Q; + I32 *oi = (I32 *)output; CHECK_STATUS(quantize_x86(outputDesc, (void *)output, &qDesc, (void *)outArray, scales)); *scaleO = scales[0]; } diff --git a/compute/tensor/src/cpu/x86/int8/convolution_functions.h b/compute/tensor/src/cpu/x86/int8/convolution_functions.h new file mode 100644 index 00000000..20409f95 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/convolution_functions.h @@ -0,0 +1,324 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
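+
+// ConvController carries the per-call state handed to each assembly micro-kernel:
+// input/filter/output/bias pointers, kernel geometry (kw, kh, dilation and filter
+// steps), the per-pixel input step table stepC16 (entries are rewritten with a
+// row-crossing step when an unrolled tile spans two output rows; see `cross`),
+// and post-processing controls (eltwise pointer, quantization scale, flags).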
+ +struct ConvController { + UINT8 *input; + const INT8 *filter; + void *output; + F32 *eltwise; + UINT8 *u8Output; + const I32 *bias; + I64 ic; + I64 kw; + I64 kh; + I64 *stepC16; + I64 dilateW; + I64 dilateH; + I64 ostepC16; + I64 flags; + I64 fStep; + I64 f8Step; + I64 f4Step; + void *scale; + bool cross; +}; + +typedef void (*kernelFunc)(ConvController &c); + +// clang-format off +#define clear1Regs(rtype) \ + "vxorps "#rtype"0, "#rtype"0, "#rtype"0 \n\t" + +#define clear2Regs(rtype) \ + clear1Regs(rtype) \ + "vxorps "#rtype"1, "#rtype"1, "#rtype"1 \n\t" + +#define clear3Regs(rtype) \ + clear2Regs(rtype) \ + "vxorps "#rtype"2, "#rtype"2, "#rtype"2 \n\t" + +#define clear12Regs(rtype) \ + clear3Regs(rtype) \ + "vxorps "#rtype"3, "#rtype"3, "#rtype"3 \n\t" \ + "vxorps "#rtype"4, "#rtype"4, "#rtype"4 \n\t" \ + "vxorps "#rtype"5, "#rtype"5, "#rtype"5 \n\t" \ + "vxorps "#rtype"6, "#rtype"6, "#rtype"6 \n\t" \ + "vxorps "#rtype"7, "#rtype"7, "#rtype"7 \n\t" \ + "vxorps "#rtype"8, "#rtype"8, "#rtype"8 \n\t" \ + "vxorps "#rtype"9, "#rtype"9, "#rtype"9 \n\t" \ + "vxorps "#rtype"10, "#rtype"10, "#rtype"10 \n\t" \ + "vxorps "#rtype"11, "#rtype"11, "#rtype"11 \n\t" + +#define clear24Regs(rtype) \ + clear12Regs(rtype) \ + "vxorps "#rtype"12, "#rtype"12, "#rtype"12 \n\t" \ + "vxorps "#rtype"13, "#rtype"13, "#rtype"13 \n\t" \ + "vxorps "#rtype"14, "#rtype"14, "#rtype"14 \n\t" \ + "vxorps "#rtype"15, "#rtype"15, "#rtype"15 \n\t" \ + "vxorps "#rtype"16, "#rtype"16, "#rtype"16 \n\t" \ + "vxorps "#rtype"17, "#rtype"17, "#rtype"17 \n\t" \ + "vxorps "#rtype"18, "#rtype"18, "#rtype"18 \n\t" \ + "vxorps "#rtype"19, "#rtype"19, "#rtype"19 \n\t" \ + "vxorps "#rtype"20, "#rtype"20, "#rtype"20 \n\t" \ + "vxorps "#rtype"21, "#rtype"21, "#rtype"21 \n\t" \ + "vxorps "#rtype"22, "#rtype"22, "#rtype"22 \n\t" \ + "vxorps "#rtype"23, "#rtype"23, "#rtype"23 \n\t" + +#define reluReg(rtype) \ + "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"0, "#rtype"0 \n\t" + +#define relu2Regs(rtype) \ + reluReg(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"1, "#rtype"1 \n\t" + +#define relu3Regs(rtype) \ + relu2Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"2, "#rtype"2 \n\t" + +#define relu12Regs(rtype) \ + relu3Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"11, "#rtype"11 \n\t" + +#define relu24Regs(rtype) \ + relu12Regs(rtype) \ + "vpmaxsd "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ + "vpmaxsd "#rtype"31, "#rtype"23, "#rtype"23 \n\t" + + +#define reluRegPs(rtype) \ + "vpxord "#rtype"31, "#rtype"31, "#rtype"31 \n\t" \ + "vmaxps "#rtype"31, "#rtype"0, "#rtype"0 \n\t" + 
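+
+// The floating-point (*Ps) ReLU variants, the convert*RegsI32ToF32
+// dequantization macros (broadcast the scale into register 24, then vcvtdq2ps
+// and vmulps per accumulator) and the load*BiasTo*Regs initializers that
+// replicate the 48-, 32- or 16-channel bias vectors across the output tile
+// follow the same scheme.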
+#define relu2RegsPs(rtype) \ + reluReg(rtype) \ + "vmaxps "#rtype"31, "#rtype"1, "#rtype"1 \n\t" + +#define relu3RegsPs(rtype) \ + relu2Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"2, "#rtype"2 \n\t" + +#define relu12RegsPs(rtype) \ + relu3Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"3, "#rtype"3 \n\t" \ + "vmaxps "#rtype"31, "#rtype"4, "#rtype"4 \n\t" \ + "vmaxps "#rtype"31, "#rtype"5, "#rtype"5 \n\t" \ + "vmaxps "#rtype"31, "#rtype"6, "#rtype"6 \n\t" \ + "vmaxps "#rtype"31, "#rtype"7, "#rtype"7 \n\t" \ + "vmaxps "#rtype"31, "#rtype"8, "#rtype"8 \n\t" \ + "vmaxps "#rtype"31, "#rtype"9, "#rtype"9 \n\t" \ + "vmaxps "#rtype"31, "#rtype"10, "#rtype"10 \n\t" \ + "vmaxps "#rtype"31, "#rtype"11, "#rtype"11 \n\t" + +#define relu24RegsPs(rtype) \ + relu12Regs(rtype) \ + "vmaxps "#rtype"31, "#rtype"12, "#rtype"12 \n\t" \ + "vmaxps "#rtype"31, "#rtype"13, "#rtype"13 \n\t" \ + "vmaxps "#rtype"31, "#rtype"14, "#rtype"14 \n\t" \ + "vmaxps "#rtype"31, "#rtype"15, "#rtype"15 \n\t" \ + "vmaxps "#rtype"31, "#rtype"16, "#rtype"16 \n\t" \ + "vmaxps "#rtype"31, "#rtype"17, "#rtype"17 \n\t" \ + "vmaxps "#rtype"31, "#rtype"18, "#rtype"18 \n\t" \ + "vmaxps "#rtype"31, "#rtype"19, "#rtype"19 \n\t" \ + "vmaxps "#rtype"31, "#rtype"20, "#rtype"20 \n\t" \ + "vmaxps "#rtype"31, "#rtype"21, "#rtype"21 \n\t" \ + "vmaxps "#rtype"31, "#rtype"22, "#rtype"22 \n\t" \ + "vmaxps "#rtype"31, "#rtype"23, "#rtype"23 \n\t" + +#define convertRegI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + +#define convert2RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + +#define convert3RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" +#define convert12RegsI32ToF32(scalePtr, rtype) \ + "vbroadcastss ("#scalePtr"), "#rtype"24 \n\t" \ + "vcvtdq2ps "#rtype"0, "#rtype"0 \n\t" \ + "vcvtdq2ps "#rtype"1, "#rtype"1 \n\t" \ + "vcvtdq2ps "#rtype"2, "#rtype"2 \n\t" \ + "vcvtdq2ps "#rtype"3, "#rtype"3 \n\t" \ + "vcvtdq2ps "#rtype"4, "#rtype"4 \n\t" \ + "vcvtdq2ps "#rtype"5, "#rtype"5 \n\t" \ + "vcvtdq2ps "#rtype"6, "#rtype"6 \n\t" \ + "vcvtdq2ps "#rtype"7, "#rtype"7 \n\t" \ + "vcvtdq2ps "#rtype"8, "#rtype"8 \n\t" \ + "vcvtdq2ps "#rtype"9, "#rtype"9 \n\t" \ + "vcvtdq2ps "#rtype"10, "#rtype"10 \n\t" \ + "vcvtdq2ps "#rtype"11, "#rtype"11 \n\t" \ + "vmulps "#rtype"0, "#rtype"24, "#rtype"0 \n\t" \ + "vmulps "#rtype"1, "#rtype"24, "#rtype"1 \n\t" \ + "vmulps "#rtype"2, "#rtype"24, "#rtype"2 \n\t" \ + "vmulps "#rtype"3, "#rtype"24, "#rtype"3 \n\t" \ + "vmulps "#rtype"4, "#rtype"24, "#rtype"4 \n\t" \ + "vmulps "#rtype"5, "#rtype"24, "#rtype"5 \n\t" \ + "vmulps "#rtype"6, "#rtype"24, "#rtype"6 \n\t" \ + "vmulps "#rtype"7, "#rtype"24, "#rtype"7 \n\t" \ + "vmulps "#rtype"8, "#rtype"24, "#rtype"8 \n\t" \ + "vmulps "#rtype"9, "#rtype"24, "#rtype"9 \n\t" \ + "vmulps "#rtype"10, "#rtype"24, "#rtype"10 \n\t" \ + "vmulps "#rtype"11, "#rtype"24, "#rtype"11 \n\t" + +#define convert24RegsI32ToF32(scalePtr, rtype) \ + 
convert12RegsI32ToF32(scalePtr, rtype) \ + "vcvtdq2ps "#rtype"12, "#rtype"12 \n\t" \ + "vcvtdq2ps "#rtype"13, "#rtype"13 \n\t" \ + "vcvtdq2ps "#rtype"14, "#rtype"14 \n\t" \ + "vcvtdq2ps "#rtype"15, "#rtype"15 \n\t" \ + "vcvtdq2ps "#rtype"16, "#rtype"16 \n\t" \ + "vcvtdq2ps "#rtype"17, "#rtype"17 \n\t" \ + "vcvtdq2ps "#rtype"18, "#rtype"18 \n\t" \ + "vcvtdq2ps "#rtype"19, "#rtype"19 \n\t" \ + "vcvtdq2ps "#rtype"20, "#rtype"20 \n\t" \ + "vcvtdq2ps "#rtype"21, "#rtype"21 \n\t" \ + "vcvtdq2ps "#rtype"22, "#rtype"22 \n\t" \ + "vcvtdq2ps "#rtype"23, "#rtype"23 \n\t" \ + "vmulps "#rtype"12, "#rtype"24, "#rtype"12 \n\t" \ + "vmulps "#rtype"13, "#rtype"24, "#rtype"13 \n\t" \ + "vmulps "#rtype"14, "#rtype"24, "#rtype"14 \n\t" \ + "vmulps "#rtype"15, "#rtype"24, "#rtype"15 \n\t" \ + "vmulps "#rtype"16, "#rtype"24, "#rtype"16 \n\t" \ + "vmulps "#rtype"17, "#rtype"24, "#rtype"17 \n\t" \ + "vmulps "#rtype"18, "#rtype"24, "#rtype"18 \n\t" \ + "vmulps "#rtype"19, "#rtype"24, "#rtype"19 \n\t" \ + "vmulps "#rtype"20, "#rtype"24, "#rtype"20 \n\t" \ + "vmulps "#rtype"21, "#rtype"24, "#rtype"21 \n\t" \ + "vmulps "#rtype"22, "#rtype"24, "#rtype"22 \n\t" \ + "vmulps "#rtype"23, "#rtype"24, "#rtype"23 \n\t" + +#define load48BiasTo3Regs(bias) \ + "vmovups ("#bias"), %%zmm0 \n\t" \ + "vmovups 0x40("#bias"), %%zmm1 \n\t" \ + "vmovups 0x80("#bias"), %%zmm2 \n\t" \ + +#define load48BiasTo12Regs(bias) \ + load48BiasTo3Regs(bias) \ + "vmovups %%zmm0, %%zmm3 \n\t" \ + "vmovups %%zmm1, %%zmm4 \n\t" \ + "vmovups %%zmm2, %%zmm5 \n\t" \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm2, %%zmm8 \n\t" \ + "vmovups %%zmm0, %%zmm9 \n\t" \ + "vmovups %%zmm1, %%zmm10 \n\t" \ + "vmovups %%zmm2, %%zmm11 \n\t" + +#define load48BiasTo24Regs(bias) \ + load48BiasTo12Regs(bias) \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm2, %%zmm14 \n\t" \ + "vmovups %%zmm0, %%zmm15 \n\t" \ + "vmovups %%zmm1, %%zmm16 \n\t" \ + "vmovups %%zmm2, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm2, %%zmm20 \n\t" \ + "vmovups %%zmm0, %%zmm21 \n\t" \ + "vmovups %%zmm1, %%zmm22 \n\t" \ + "vmovups %%zmm2, %%zmm23 \n\t" + +#define load32BiasTo2Regs(bias) \ + "vmovups ("#bias"), %%zmm0 \n\t" \ + "vmovups 0x40("#bias"), %%zmm1 \n\t" \ + +#define load32BiasTo12Regs(bias) \ + load32BiasTo2Regs(bias) \ + "vmovups %%zmm0, %%zmm2 \n\t" \ + "vmovups %%zmm1, %%zmm3 \n\t" \ + "vmovups %%zmm0, %%zmm4 \n\t" \ + "vmovups %%zmm1, %%zmm5 \n\t" \ + "vmovups %%zmm0, %%zmm6 \n\t" \ + "vmovups %%zmm1, %%zmm7 \n\t" \ + "vmovups %%zmm0, %%zmm8 \n\t" \ + "vmovups %%zmm1, %%zmm9 \n\t" \ + "vmovups %%zmm0, %%zmm10 \n\t" \ + "vmovups %%zmm1, %%zmm11 \n\t" + +#define load32BiasTo24Regs(bias) \ + load32BiasTo12Regs(bias) \ + "vmovups %%zmm0, %%zmm12 \n\t" \ + "vmovups %%zmm1, %%zmm13 \n\t" \ + "vmovups %%zmm0, %%zmm14 \n\t" \ + "vmovups %%zmm1, %%zmm15 \n\t" \ + "vmovups %%zmm0, %%zmm16 \n\t" \ + "vmovups %%zmm1, %%zmm17 \n\t" \ + "vmovups %%zmm0, %%zmm18 \n\t" \ + "vmovups %%zmm1, %%zmm19 \n\t" \ + "vmovups %%zmm0, %%zmm20 \n\t" \ + "vmovups %%zmm1, %%zmm21 \n\t" \ + "vmovups %%zmm0, %%zmm22 \n\t" \ + "vmovups %%zmm1, %%zmm23 \n\t" + +#define load16BiasTo1Regs(bias, rtype) \ + "vmovups ("#bias"), "#rtype"0 \n\t" + +#define load16BiasTo12Regs(bias, rtype) \ + load16BiasTo1Regs(bias, rtype) \ + "vmovups "#rtype"0, "#rtype"1 \n\t" \ + "vmovups "#rtype"0, "#rtype"2 \n\t" \ + "vmovups "#rtype"0, "#rtype"3 \n\t" \ + "vmovups "#rtype"0, "#rtype"4 \n\t" \ + "vmovups 
"#rtype"0, "#rtype"5 \n\t" \ + "vmovups "#rtype"0, "#rtype"6 \n\t" \ + "vmovups "#rtype"0, "#rtype"7 \n\t" \ + "vmovups "#rtype"0, "#rtype"8 \n\t" \ + "vmovups "#rtype"0, "#rtype"9 \n\t" \ + "vmovups "#rtype"0, "#rtype"10 \n\t" \ + "vmovups "#rtype"0, "#rtype"11 \n\t" + +#define load16BiasTo24Regs(bias, rtype) \ + load16BiasTo12Regs(bias, rtype) \ + "vmovups "#rtype"0, "#rtype"12 \n\t" \ + "vmovups "#rtype"0, "#rtype"13 \n\t" \ + "vmovups "#rtype"0, "#rtype"14 \n\t" \ + "vmovups "#rtype"0, "#rtype"15 \n\t" \ + "vmovups "#rtype"0, "#rtype"16 \n\t" \ + "vmovups "#rtype"0, "#rtype"17 \n\t" \ + "vmovups "#rtype"0, "#rtype"18 \n\t" \ + "vmovups "#rtype"0, "#rtype"19 \n\t" \ + "vmovups "#rtype"0, "#rtype"20 \n\t" \ + "vmovups "#rtype"0, "#rtype"21 \n\t" \ + "vmovups "#rtype"0, "#rtype"22 \n\t" \ + "vmovups "#rtype"0, "#rtype"23 \n\t" diff --git a/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp b/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp index b45767d0..f48cce31 100644 --- a/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp +++ b/compute/tensor/src/cpu/x86/int8/convolution_transform.cpp @@ -110,7 +110,7 @@ inline EE convolution_transform_filter_kernel_int8(TensorDesc filterDesc, CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); if (fdf == ftmDataFormat) { *ftmDesc = filterDesc; - memcpy(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); + UNI_MEMCPY(ftmArray, filterArray, fn * fc * fh * fw * bytesOf(fdt)); return SUCCESS; } if (fdf != DF_NCHW) { diff --git a/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp new file mode 100644 index 00000000..8b313ce2 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_direct.cpp @@ -0,0 +1,596 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "uni.h" +#include "cpu/x86/fp32/convolution_functions.h" +#include "cpu/x86/tensor_computing_x86.h" + +#define UNROLL_W 4 +#define UNROLL_OC_BLOCK_DIM 16 +#define SIMDW 16 + +struct ConvController { + UINT8 *input; + const INT8 *filter; + void *output; + F32 *eltwise; + UINT8 *u8Output; + const I32 *bias; + I64 ic; + I64 kw; + I64 kh; + I64 *stepC16; + I64 ostepC16; + I64 flags; + I64 fStep; + I64 hStep; + I64 stride; + I64 k4Num; + void *scale; +}; + +typedef void (*kernelFunc)(ConvController &c); + +void Avx512DepthConvKernel16x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "prefetcht0 0x40(%[output]) \n\t" + "prefetcht0 0x80(%[output]) \n\t" + "prefetcht0 0xC0(%[output]) \n\t" + "prefetcht0 0x100(%[output]) \n\t" + "prefetcht0 0x140(%[output]) \n\t" + "prefetcht0 0x180(%[output]) \n\t" + "prefetcht0 0x1C0(%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + "vmovups %%zmm0, %%zmm4 \n\t" + "vmovups %%zmm0, %%zmm5 \n\t" + "vmovups %%zmm0, %%zmm6 \n\t" + "vmovups %%zmm0, %%zmm7 \n\t" + "vmovups %%zmm0, %%zmm8 \n\t" + "vmovups %%zmm0, %%zmm9 \n\t" + "vmovups %%zmm0, %%zmm10 \n\t" + "vmovups %%zmm0, %%zmm11 \n\t" + "vmovups %%zmm0, %%zmm12 \n\t" + "vmovups %%zmm0, %%zmm13 \n\t" + "vmovups %%zmm0, %%zmm14 \n\t" + "vmovups %%zmm0, %%zmm15 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vmovups 0x40(%[input]), %%zmm18 \n\t" + "vmovups 0x80(%[input]), %%zmm19 \n\t" + "vmovups 0xC0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm1 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm2 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm3 \n\t" + "vmovups 0x100(%[input]), %%zmm21 \n\t" + "vmovups 0x140(%[input]), %%zmm22 \n\t" + "vmovups 0x180(%[input]), %%zmm23 \n\t" + "vmovups 0x1C0(%[input]), %%zmm24 \n\t" + "vpdpbusd %%zmm16, %%zmm21, %%zmm4 \n\t" + "vpdpbusd %%zmm16, %%zmm22, %%zmm5 \n\t" + "vpdpbusd %%zmm16, %%zmm23, %%zmm6 \n\t" + "vpdpbusd %%zmm16, %%zmm24, %%zmm7 \n\t" + "vmovups 0x200(%[input]), %%zmm25 \n\t" + "vmovups 0x240(%[input]), %%zmm26 \n\t" + "vmovups 0x280(%[input]), %%zmm27 \n\t" + "vmovups 0x2C0(%[input]), %%zmm28 \n\t" + "vpdpbusd %%zmm16, %%zmm25, %%zmm8 \n\t" + "vpdpbusd %%zmm16, %%zmm26, %%zmm9 \n\t" + "vpdpbusd %%zmm16, %%zmm27, %%zmm10 \n\t" + "vpdpbusd %%zmm16, %%zmm28, %%zmm11 \n\t" + "vmovups 0x300(%[input]), %%zmm17 \n\t" + "vmovups 0x340(%[input]), %%zmm18 \n\t" + "vmovups 0x380(%[input]), %%zmm19 \n\t" + "vmovups 0x3C0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm12 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm13 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm14 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm15 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", 
"%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "vpmaxsd %%zmm31, %%zmm1, %%zmm1 \n\t" + "vpmaxsd %%zmm31, %%zmm2, %%zmm2 \n\t" + "vpmaxsd %%zmm31, %%zmm3, %%zmm3 \n\t" + "vpmaxsd %%zmm31, %%zmm4, %%zmm4 \n\t" + "vpmaxsd %%zmm31, %%zmm5, %%zmm5 \n\t" + "vpmaxsd %%zmm31, %%zmm6, %%zmm6 \n\t" + "vpmaxsd %%zmm31, %%zmm7, %%zmm7 \n\t" + "vpmaxsd %%zmm31, %%zmm8, %%zmm8 \n\t" + "vpmaxsd %%zmm31, %%zmm9, %%zmm9 \n\t" + "vpmaxsd %%zmm31, %%zmm10, %%zmm10 \n\t" + "vpmaxsd %%zmm31, %%zmm11, %%zmm11 \n\t" + "vpmaxsd %%zmm31, %%zmm12, %%zmm12 \n\t" + "vpmaxsd %%zmm31, %%zmm13, %%zmm13 \n\t" + "vpmaxsd %%zmm31, %%zmm14, %%zmm14 \n\t" + "vpmaxsd %%zmm31, %%zmm15, %%zmm15 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmulps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmulps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmulps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmulps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmulps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmulps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmulps %%zmm31, %%zmm7, %%zmm7 \n\t" + "vmulps %%zmm31, %%zmm8, %%zmm8 \n\t" + "vmulps %%zmm31, %%zmm9, %%zmm9 \n\t" + "vmulps %%zmm31, %%zmm10, %%zmm10 \n\t" + "vmulps %%zmm31, %%zmm11, %%zmm11 \n\t" + "vmulps %%zmm31, %%zmm12, %%zmm12 \n\t" + "vmulps %%zmm31, %%zmm13, %%zmm13 \n\t" + "vmulps %%zmm31, %%zmm14, %%zmm14 \n\t" + "vmulps %%zmm31, %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + "vaddps 0x200(%[eltwise]), %%zmm8, %%zmm8 \n\t" + "vaddps 0x240(%[eltwise]), %%zmm9, %%zmm9 \n\t" + "vaddps 0x280(%[eltwise]), %%zmm10, %%zmm10 \n\t" + "vaddps 0x2C0(%[eltwise]), %%zmm11, %%zmm11 \n\t" + "vaddps 0x300(%[eltwise]), %%zmm12, %%zmm12 \n\t" + "vaddps 0x340(%[eltwise]), %%zmm13, %%zmm13 \n\t" + "vaddps 0x380(%[eltwise]), %%zmm14, %%zmm14 \n\t" + "vaddps 0x3C0(%[eltwise]), %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm31, %%zmm7, %%zmm7 \n\t" + "vmaxps %%zmm31, %%zmm8, %%zmm8 \n\t" + "vmaxps %%zmm31, %%zmm9, %%zmm9 \n\t" + "vmaxps %%zmm31, %%zmm10, %%zmm10 \n\t" + "vmaxps %%zmm31, %%zmm11, %%zmm11 \n\t" + "vmaxps %%zmm31, %%zmm12, %%zmm12 \n\t" + "vmaxps %%zmm31, %%zmm13, %%zmm13 \n\t" + "vmaxps %%zmm31, %%zmm14, %%zmm14 \n\t" + "vmaxps %%zmm31, %%zmm15, %%zmm15 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + "vmovups %%zmm1, 0x40(%[output]) \n\t" + "vmovups %%zmm2, 0x80(%[output]) \n\t" + "vmovups %%zmm3, 
0xC0(%[output]) \n\t" + "vmovups %%zmm4, 0x100(%[output]) \n\t" + "vmovups %%zmm5, 0x140(%[output]) \n\t" + "vmovups %%zmm6, 0x180(%[output]) \n\t" + "vmovups %%zmm7, 0x1C0(%[output]) \n\t" + "vmovups %%zmm8, 0x200(%[output]) \n\t" + "vmovups %%zmm9, 0x240(%[output]) \n\t" + "vmovups %%zmm10, 0x280(%[output]) \n\t" + "vmovups %%zmm11, 0x2C0(%[output]) \n\t" + "vmovups %%zmm12, 0x300(%[output]) \n\t" + "vmovups %%zmm13, 0x340(%[output]) \n\t" + "vmovups %%zmm14, 0x380(%[output]) \n\t" + "vmovups %%zmm15, 0x3C0(%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +void Avx512DepthConvKernel8x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "prefetcht0 0x40(%[output]) \n\t" + "prefetcht0 0x80(%[output]) \n\t" + "prefetcht0 0xC0(%[output]) \n\t" + "prefetcht0 0x100(%[output]) \n\t" + "prefetcht0 0x140(%[output]) \n\t" + "prefetcht0 0x180(%[output]) \n\t" + "prefetcht0 0x1C0(%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + "vmovups %%zmm0, %%zmm1 \n\t" + "vmovups %%zmm0, %%zmm2 \n\t" + "vmovups %%zmm0, %%zmm3 \n\t" + "vmovups %%zmm0, %%zmm4 \n\t" + "vmovups %%zmm0, %%zmm5 \n\t" + "vmovups %%zmm0, %%zmm6 \n\t" + "vmovups %%zmm0, %%zmm7 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vmovups 0x40(%[input]), %%zmm18 \n\t" + "vmovups 0x80(%[input]), %%zmm19 \n\t" + "vmovups 0xC0(%[input]), %%zmm20 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "vpdpbusd %%zmm16, %%zmm18, %%zmm1 \n\t" + "vpdpbusd %%zmm16, %%zmm19, %%zmm2 \n\t" + "vpdpbusd %%zmm16, %%zmm20, %%zmm3 \n\t" + "vmovups 0x100(%[input]), %%zmm21 \n\t" + "vmovups 0x140(%[input]), %%zmm22 \n\t" + "vmovups 0x180(%[input]), %%zmm23 \n\t" + "vmovups 0x1C0(%[input]), %%zmm24 \n\t" + "vpdpbusd %%zmm16, %%zmm21, %%zmm4 \n\t" + "vpdpbusd %%zmm16, %%zmm22, %%zmm5 \n\t" + "vpdpbusd %%zmm16, %%zmm23, %%zmm6 \n\t" + "vpdpbusd %%zmm16, %%zmm24, %%zmm7 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "vpmaxsd %%zmm31, %%zmm1, %%zmm1 \n\t" + "vpmaxsd %%zmm31, %%zmm2, %%zmm2 \n\t" + "vpmaxsd %%zmm31, %%zmm3, %%zmm3 \n\t" + "vpmaxsd %%zmm31, %%zmm4, %%zmm4 \n\t" + "vpmaxsd %%zmm31, %%zmm5, 
%%zmm5 \n\t" + "vpmaxsd %%zmm31, %%zmm6, %%zmm6 \n\t" + "vpmaxsd %%zmm31, %%zmm7, %%zmm7 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmulps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmulps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmulps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmulps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmulps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmulps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmulps %%zmm31, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + "vaddps 0x40(%[eltwise]), %%zmm1, %%zmm1 \n\t" + "vaddps 0x80(%[eltwise]), %%zmm2, %%zmm2 \n\t" + "vaddps 0xC0(%[eltwise]), %%zmm3, %%zmm3 \n\t" + "vaddps 0x100(%[eltwise]), %%zmm4, %%zmm4 \n\t" + "vaddps 0x140(%[eltwise]), %%zmm5, %%zmm5 \n\t" + "vaddps 0x180(%[eltwise]), %%zmm6, %%zmm6 \n\t" + "vaddps 0x1C0(%[eltwise]), %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + "vmaxps %%zmm31, %%zmm1, %%zmm1 \n\t" + "vmaxps %%zmm31, %%zmm2, %%zmm2 \n\t" + "vmaxps %%zmm31, %%zmm3, %%zmm3 \n\t" + "vmaxps %%zmm31, %%zmm4, %%zmm4 \n\t" + "vmaxps %%zmm31, %%zmm5, %%zmm5 \n\t" + "vmaxps %%zmm31, %%zmm6, %%zmm6 \n\t" + "vmaxps %%zmm31, %%zmm7, %%zmm7 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + "vmovups %%zmm1, 0x40(%[output]) \n\t" + "vmovups %%zmm2, 0x80(%[output]) \n\t" + "vmovups %%zmm3, 0xC0(%[output]) \n\t" + "vmovups %%zmm4, 0x100(%[output]) \n\t" + "vmovups %%zmm5, 0x140(%[output]) \n\t" + "vmovups %%zmm6, 0x180(%[output]) \n\t" + "vmovups %%zmm7, 0x1C0(%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +void Avx512DepthConvKernel1x16(ConvController &c) { + __asm__ __volatile__("prefetcht0 (%[output]) \n\t" + "vmovups (%[bias]), %%zmm0 \n\t" + : + : [bias] "r" (c.bias), [flags] "r" (c.flags), [output] "r" (c.output) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "memory", "cc"); + + __asm__ __volatile__(".align 16 \n\t" + "0: \n\t" + "vmovups (%[filter]), %%zmm16 \n\t" + "vmovups (%[input]), %%zmm17 \n\t" + "vpdpbusd %%zmm16, %%zmm17, %%zmm0 \n\t" + "addq $0x40, %[filter] \n\t" + "addq %[hStep], %[input] \n\t" + "dec %%rcx \n\t" + "jg 0b \n\t" + : [input] "+r" (c.input), [filter] "+r" (c.filter) + : [k4Num] "c" (c.k4Num), [stride] "r" (c.stride), [hStep] "r" (c.hStep) + : "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); + + __asm__ __volatile__("cmpq $0x0, %[scale] \n\t" + "jne 1f \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord 
%%zmm31, %%zmm31, %%zmm31 \n\t" + "vpmaxsd %%zmm31, %%zmm0, %%zmm0 \n\t" + "jmp 4f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "vbroadcastss (%[scale]), %%zmm30 \n\t" + "vcvtdq2ps %%zmm30, %%zmm31 \n\t" + "vmulps %%zmm31, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "2: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0x2, %%rcx \n\t" + "je 3f \n\t" + "vaddps (%[eltwise]), %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "3: \n\t" + "movq %[flags], %%rcx \n\t" + "and $0xC, %%rcx \n\t" + "je 4f \n\t" + "vpxord %%zmm31, %%zmm31, %%zmm31 \n\t" + "vmaxps %%zmm31, %%zmm0, %%zmm0 \n\t" + + ".align 16 \n\t" + "4: \n\t" + "vmovups %%zmm0, (%[output]) \n\t" + : + : [output] "r" (c.output), [eltwise] "r" (c.eltwise), [ostepC16] "r" (c.ostepC16), + [flags] "r" (c.flags), [scale] "r" (c.scale) + : "%rax", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm5", "%zmm6", + "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "%zmm14", + "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19", "%zmm20", "%zmm21", "%zmm22", + "%zmm23", "%zmm24", "%zmm25", "%zmm26", "%zmm27", "%zmm28", "%zmm29", "%zmm30", + "%zmm31", "memory", "cc"); +} + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + UINT8 *inArray, + F32 *eltwiseInput, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *outArray, + F32 *scale, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec) +{ + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + I32 in, ic, ih, iw; + I32 fn, fc, fh, fw; + I32 on, oc, oh, ow; + CHECK_STATUS(tensor4dGetI32(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGetI32(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + CHECK_STATUS(tensor4dGetI32(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if ((idf != DF_NCHWC16) || (ic % 16 != 0)) { + CHECK_STATUS(NOT_MATCH); + } + + // get computing params + I32 strideH = convParamSpec.stride_h; + I32 strideW = convParamSpec.stride_w; + I32 paddingT = convParamSpec.pad_top; + I32 paddingB = convParamSpec.pad_bottom; + I32 paddingL = convParamSpec.pad_left; + I32 paddingR = convParamSpec.pad_right; + I32 dilateH = convParamSpec.dilatedRate_h; + I32 dilateW = convParamSpec.dilatedRate_w; + I32 fhDilated = (fh - 1) * dilateH + 1; + I32 fwDilated = (fw - 1) * dilateW + 1; + I32 ohow = oh * ow; + I32 fhfw = fh * fw; + I32 iw_pad = iw + paddingL + paddingR; + I32 ih_pad = ih + paddingT + paddingB; + + // infer kernel params + ConvController convCtl; + convCtl.ostepC16 = oh * ow * SIMDW * 4; + convCtl.fStep = ih_pad * iw_pad * SIMDW; + convCtl.kw = fw; + convCtl.kh = fh; + convCtl.scale = nullptr; + convCtl.stride = strideW; + + // fuse dw+pw + F32 *useOutArray = (F32 *)tmp; + if (pwFilterArray == nullptr) { + useOutArray = (F32 *)outArray; + } + F32 *output = (F32 *)useOutArray; + + const kernelFunc kernel[3] = { + Avx512DepthConvKernel1x16, Avx512DepthConvKernel8x16, Avx512DepthConvKernel16x16}; + U32 hwSizes[3] = {1, 8, 16}; + + // quantization + F32 *scaleI = scale; + F32 *scaleO = scale + 1; + F32 *scaleF = scale + 2; + if (idt != DT_U8_Q) { + //quantize to U8_Q + TensorDesc qDesc = inputDesc; + qDesc.dt = DT_U8_Q; + CHECK_STATUS(quantize_x86(inputDesc, (void *)inArray, &qDesc, tmp, scaleI)); + inArray = (UINT8 *)tmp; + tmp = (void *)((U8 *)tmp + 
tensorNumBytes(qDesc)); + } + *scaleO = scaleI[0] * scaleF[0]; + if (odt != DT_F32 && odt != DT_I32) { + output = (F32 *)tmp; + tmp = (void *)((U8 *)tmp + tensorNumElements(outputDesc) * bytesOf(DT_I32)); + outputDesc.dt = DT_I32; + } + if (eltwiseInput != nullptr) { + outputDesc.dt = DT_F32; + } + F32 *factorPtr = nullptr; + F32 factor = 0; + if (scale != nullptr && outputDesc.dt == DT_F32) { + factor = 1 / (*scaleO); + factorPtr = &factor; + } + + I32 *offsetC = (I32 *)tmp; + tmp = (void *)((U8 *)tmp + oc * bytesOf(DT_I32)); + CHECK_STATUS(quantize_bias_offsetC((const void *)dwBiasArray, dwBiasDesc, DT_I32, + (const void *)dwFilterArray, dwFilterDesc, scaleO, offsetC)); + dwFilterArray += oc * 4; + + U32 kernelSize = (fh * fw + 3) / 4 * 4; + convCtl.k4Num = kernelSize / 4; + UINT8 *tmpInput = (UINT8 *)tmp; + + I64 flags = 0; + flags |= (eltwiseInput != nullptr) << 1; + flags |= U32(depthwiseActivationParamSpec.mode) << 2; + convCtl.scale = factorPtr; + convCtl.flags = flags; + + for (I32 n = 0; n < in; ++n) { + I32 ocSize = 16; + // Padding + for (I32 ocb = 0; ocb < oc; ocb += ocSize) { + convCtl.bias = offsetC + ocb; + F32 *curO = output + (n * oc + ocb) * oh * ow; + I32 hwSize = 0; + UINT8 *curI = inArray + (n * ic + ocb) * ih * iw; + for (I32 hw = 0; hw < ohow; hw += hwSize) { + hwSize = UNI_MIN(ohow - hw, 16); + hwSize = hwSizes[hwSize >> 3]; + I32 h = hw / ow; + I32 w = hw % ow; + I32 in_h_0 = h * strideH; + I32 in_w_0 = w * strideW; + + // TODO: optimize + for (U32 kk = 0; kk < kernelSize; kk += 4) { + for (I32 ii = 0; ii < hwSize; ++ii) { + for (I32 jj = 0; jj < SIMDW; ++jj) { + for (I32 k4 = 0; k4 < 4; ++k4) { + I32 oidx = k4 + jj * 4 + ii * 4 * SIMDW + kk * SIMDW * hwSize; + if ((k4 + kk) < fhfw) { + in_h_0 = (hw + ii) / ow * strideH + (kk + k4) / fw; + in_w_0 = (hw + ii) % ow * strideW + (kk + k4) % fw; + I32 iidx = jj + (in_h_0 * iw + in_w_0) * SIMDW; + tmpInput[oidx] = curI[iidx]; + } else { + tmpInput[oidx] = 0; + } + } + } + } + } + + convCtl.input = tmpInput; + convCtl.output = curO + (h * ow + w) * SIMDW; + convCtl.filter = dwFilterArray + ocb * kernelSize; + convCtl.hStep = hwSize * SIMDW * 4; + kernel[hwSize >> 3](convCtl); + } + } + } + + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp new file mode 100644 index 00000000..22c118e3 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/depthwise_convolution_transform.cpp @@ -0,0 +1,99 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/int8/transform_functions_int8.h" + +EE depthwise_convolution_transform_filter_int8( + TensorDesc filterDesc, const INT8 *filter, TensorDesc *ftmDesc, INT8 *filterTransformed) +{ + DataFormat ftmDataFormat = DF_NCHWN8HW4; // for flag, actually DF_NCHWN16HW4 + + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + if (fdf == ftmDataFormat) { + *ftmDesc = filterDesc; + UNI_MEMCPY(filterTransformed, filter, fn * fc * fh * fw * bytesOf(fdt)); + return SUCCESS; + } + if (fdf != DF_NCHW) { + CHECK_STATUS(NOT_SUPPORTED); + } + filterDesc = tensor4df(fdt, fdf, fc, 1, fh, fw); + *ftmDesc = tensor4df(fdt, ftmDataFormat, fc, 1, fh, fw); + + U32 fhfw = fh * fw; + U32 fhfwAligned = (fhfw + 3) / 4 * 4; + + I32 *offsetC = (I32 *)filterTransformed; + filterTransformed += fc * bytesOf(DT_I32); + for (U32 n = 0; n < fc; ++n) { + I32 sum = 0; + for (U32 i = 0; i < fh * fw; ++i) { + sum += filter[i + n * fh * fw]; + } + offsetC[n] = -128 * sum; + } + + for (U32 n = 0; n < fn; ++n) { + for (U32 c = 0; c < fc; c += 16) { + for (U32 hw = 0; hw < fhfwAligned; hw += 4) { + U32 c16; + for (c16 = 0; (c16 < 16) && (c16 < (fc - c)); ++c16) { + U32 w4; + for (w4 = 0; (w4 < 4) && (w4 < (fhfw - hw)); ++w4) { + U32 iidx = n * c * fhfw + (c + c16) * fhfw + hw + w4; + U32 oidx = n * c * fhfwAligned + c * fhfwAligned + hw * 16 + 4 * c16 + w4; + filterTransformed[oidx] = filter[iidx]; + } + for (; w4 < 4; ++w4) { + filterTransformed[n * c * fhfwAligned + c * fhfwAligned + hw * 16 + + 4 * c16 + w4] = 0; + } + } + for (; c16 < 16; ++c16) { + UNI_MEMSET( + filterTransformed + n * c * fhfw + c * fhfw + hw * 16 + c16 * 4, 0, 4); + } + } + } + } + + return SUCCESS; +} + +EE depthwise_pointwise_convolution_transform_filter_int8(TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + INT8 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + INT8 *pwFilterTransformed) +{ + EE ret = depthwise_convolution_transform_filter_int8( + dwFilterDesc, dwFilter, dwFtmDesc, dwFilterTransformed); + CHECK_STATUS(ret); + if (pwFilter == nullptr) { + return ret; + } + + ConvolutionParamSpec p = createConvolutionParamSpec(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, + 1, pwFilterDesc.dims[pwFilterDesc.nDims - 1], CONVOLUTION_POINTWISE); + ret = convolution_transform_filter_int8( + pwFilterDesc, pwFilter, p, CONVOLUTION_ALGORITHM_POINTWISE, pwFtmDesc, pwFilterTransformed); + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int8/lstm.cpp b/compute/tensor/src/cpu/x86/int8/lstm.cpp new file mode 100644 index 00000000..ead97669 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/lstm.cpp @@ -0,0 +1,189 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "cpu/x86/int8/tensor_computing_int8.h" +#include "cpu/x86/fp32/x86_functions_fp32.h" +#include "cpu/x86/fp32/mvm_nkn32.h" +#include "cpu/tensor_computing_cpu.h" +#include "blas_enhance.h" + +EE lstmcell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + UNUSED(biasDesc); + UNUSED(tmpBytes); + UNUSED(arch); + if (nullptr == filter || nullptr == bias || nullptr == state || nullptr == tmp || + nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + + DataType idt, fdt, odt; + DataFormat idf, fdf, odf; + U32 in, ix; + U32 on, oh; + U32 fk, fn; + CHECK_STATUS(tensor2dGet(xDesc, &idt, &idf, &in, &ix)); + CHECK_STATUS(tensor2dGet(filterDesc[0], &fdt, &fdf, &fn, &fk)); + CHECK_STATUS(tensor2dGet(hDesc, &odt, &odf, &on, &oh)); + if (fdf != DF_NKN32) { + CHECK_STATUS(NOT_MATCH); + } + fn /= 32; + + U32 batch = in; + I32 xDim = ix; + I32 hDim = rnnParamSpec.num_outputs; + I32 column = (rnnParamSpec.num_projection > 0) ? rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; + int num1 = rnnParamSpec.bi_direction ? 
2 : 1; + U32 steps = batchStrideH / hDim / num1; + if (!(idt == DT_F32 && fdt == DT_F32 && odt == DT_F32)) { + CHECK_STATUS(NOT_MATCH); + } + if (!(4 * column == (I32)fn * 32 && (ix + oh) == fk && in == on)) { + CHECK_STATUS(NOT_MATCH); + } + F32 forgetBias = rnnParamSpec.forget_bias; + if (rnnParamSpec.activation_type != ACTIVATION_TANH) { + CHECK_STATUS(NOT_SUPPORTED); + } + + const F32 *currentXArray = (const F32 *)currentX; + F32 *lastStateArray = (F32 *)state; + F32 *lastHArray = lastStateArray + column; + F32 *tmpArray = (F32 *)tmp; + F32 *currentStateArray = (F32 *)state; + F32 *currentHArray = currentStateArray + column; + F32 *outputArray = (F32 *)output; + F32 *xhArray = tmpArray; + F32 *intermediateH = xhArray + (xDim + hDim); + UINT8 *quant = (UINT8 *)(intermediateH + fn * 32); + U32 lastStateStride = column + hDim; + U32 lastHStride = column + hDim; + U32 currentStateStride = column + hDim; + U32 currentHStride = column + hDim; + __m256 forgetBiasVector = _mm256_set1_ps(forgetBias); + for (U32 m = 0; m < batch; m++) { + F32 *lastBatchH = lastHArray + m * lastHStride; + if (xDim > 0) { + UNI_MEMCPY(xhArray, currentXArray + m * batchStrideX, xDim * sizeof(F32)); + UNI_MEMCPY(xhArray + xDim, lastBatchH, hDim * sizeof(F32)); + } else { + intermediateH = tmpArray; + xhArray = lastBatchH; + } + const F32 *mBias = (const F32 *)bias[0] + m * steps * column * 4; + + TensorDesc aDesc = tensor2df(DT_I8, targetFormat4mvmMatrix(DT_I8), fn * 32, fk); + TensorDesc b0Desc = tensor1d(DT_F32, fk); + TensorDesc b1Desc = tensor1d(DT_U8_Q, fk); + TensorDesc cDesc = tensor1d(DT_F32, fn * 32); + F32 iScale = -1, fScale = *scale; + CHECK_STATUS(quantize_cpu(b0Desc, xhArray, &b1Desc, quant, &iScale, arch)); + F32 oScale = iScale * fScale; + UNI_MEMSET(intermediateH, 0, sizeof(F32) * fn * 32); + CHECK_STATUS(matrix_vector_multiply(aDesc, filter[0], b1Desc, quant, tmpBytes, + (void *)filter[0], cDesc, intermediateH, &oScale, arch)); + array_add_f32(intermediateH, mBias, intermediateH, fn * 32); + + F32 *out_i = intermediateH; + F32 *out_g = out_i + column; + F32 *out_f = out_i + column * 2; + F32 *out_o = out_i + column * 3; + + F32 *lastBatchState = lastStateArray + m * lastStateStride; + F32 *currentBatchState = currentStateArray + m * currentStateStride; + F32 *currentBatchH = currentHArray + m * currentHStride; + F32 *currentOutput = outputArray + m * batchStrideH; + + F32 *tmpState, *tmpHH, *tmpH; + if (rnnParamSpec.zoneout_cell == 0) { + tmpState = currentBatchState; + } else { + tmpState = out_i; + } + if (rnnParamSpec.num_projection > 0) { + tmpHH = out_g; + tmpH = currentOutput; + } else { + tmpHH = currentOutput; + tmpH = out_g; + } + + I32 h = 0; + for (; h < column - 7; h += 8) { + __m256 out_i_v = _mm256_loadu_ps(out_i + h); + __m256 out_g_v = _mm256_loadu_ps(out_g + h); + __m256 out_f_v = _mm256_loadu_ps(out_f + h); + __m256 out_o_v = _mm256_loadu_ps(out_o + h); + __m256 C_v = _mm256_loadu_ps(lastBatchState + h); + __m256 I_v = _mm256_sigmod_ps(out_i_v); + __m256 F_v = _mm256_sigmod_ps(_mm256_add_ps(out_f_v, forgetBiasVector)); + __m256 O_v = _mm256_sigmod_ps(out_o_v); + __m256 G_v = _mm256_tanh_ps(out_g_v); + C_v = _mm256_add_ps(_mm256_mul_ps(C_v, F_v), _mm256_mul_ps(I_v, G_v)); + __m256 out_hidden_v = _mm256_mul_ps(O_v, _mm256_tanh_ps(C_v)); + _mm256_storeu_ps(tmpState + h, C_v); + _mm256_storeu_ps(tmpHH + h, out_hidden_v); + } + for (; h < column; h++) { + F32 C_s = lastBatchState[h]; + F32 I_s = 1.0 / (1.0 + exp(-out_i[h])); + F32 F_s = 1.0 / (1.0 + exp(-(out_f[h] + forgetBias))); + 
F32 O_s = 1.0 / (1.0 + exp(-out_o[h])); + F32 G_s = tanh(out_g[h]); + C_s = C_s * F_s + I_s * G_s; + F32 value = O_s * tanh(C_s); + tmpState[h] = C_s; + tmpHH[h] = value; + } + if (rnnParamSpec.zoneout_cell != 0) { + array_scale_f32(tmpState, tmpState, column, 1 - rnnParamSpec.zoneout_cell, 0); + array_scale_f32(lastBatchState, lastBatchState, column, rnnParamSpec.zoneout_cell, 0); + array_add_f32(tmpState, lastBatchState, currentBatchState, column); + } + + if (rnnParamSpec.num_projection > 0) { + mvm_nkn32_with_bias(hDim / 32, rnnParamSpec.num_projection, (const F32 *)filter[1], + tmpHH, tmpH, nullptr); + } + + if (rnnParamSpec.zoneout_output != 0) { + if (rnnParamSpec.num_projection > 0) { + array_scale_f32(tmpH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); + } else { + array_scale_f32(tmpHH, out_f, hDim, 1 - rnnParamSpec.zoneout_output, 0); + } + array_scale_f32(lastBatchH, lastBatchH, hDim, rnnParamSpec.zoneout_output, 0); + array_add_f32(out_f, lastBatchH, currentBatchH, hDim); + } else { + UNI_MEMCPY(currentBatchH, currentOutput, sizeof(F32) * hDim); + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp b/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp new file mode 100644 index 00000000..094dd16d --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/pooling_int8.cpp @@ -0,0 +1,429 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
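+
+// INT8 pooling kernels for the DF_NCHWC16 layout: max pooling compares the raw
+// U8 lanes with vpmaxub, while mean pooling widens to 32-bit, accumulates,
+// scales by a precomputed 65536/(kh*kw) factor and shifts right by 16 before
+// packing back to U8 with vpmovusdb; each call produces 1, 2 or 4 output
+// pixels.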
+ +#include "cpu/x86/fp32/tensor_computing_fp32.h" + +#define UNROLL_W 4 + +typedef void (*pooling_max_func)( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride); +typedef void (*pooling_mean_func)( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize); + +void pooling_c16_max_w4(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + + "vmovups (%0), %%xmm0 \n\t" + "vmovups (%%rax), %%xmm1 \n\t" + "vmovups (%%r9), %%xmm2 \n\t" + "vmovups (%%r10), %%xmm3 \n\t" + + ".align 16 \n\t" + "0: \n\t" + + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vmovups (%%r9), %%xmm6 \n\t" + "vmovups (%%r10), %%xmm7 \n\t" + + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "vpmaxub %%xmm1, %%xmm5, %%xmm1 \n\t" + "vpmaxub %%xmm2, %%xmm6, %%xmm2 \n\t" + "vpmaxub %%xmm3, %%xmm7, %%xmm3 \n\t" + + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + + "vmovups %%xmm0, (%1) \n\t" + "vmovups %%xmm1, 0x10(%1) \n\t" + "vmovups %%xmm2, 0x20(%1) \n\t" + "vmovups %%xmm3, 0x30(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%xmm0", "%xmm1", "%xmm2", + "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "memory", "cc"); +} + +void pooling_c16_max_w2(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %%eax, %%eax \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + "vmovups (%0), %%xmm0 \n\t" + "vmovups (%%rax), %%xmm1 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "vpmaxub %%xmm1, %%xmm5, %%xmm1 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%xmm0, (%1) \n\t" + "vmovups %%xmm1, 0x10(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%xmm0", "%xmm1", "%xmm4", "%xmm5", "memory", "cc"); +} + +void pooling_c16_max_w1(const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride) +{ + __asm__ __volatile__("mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "vmovups (%0), %%xmm0 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vpmaxub %%xmm0, %%xmm4, %%xmm0 \n\t" + "add $0x10, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vmovups %%xmm0, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride) + : "%eax", "%rax", "%ecx", "%rdi", "%xmm0", "%xmm4", "memory", "cc"); +} + +void pooling_c16_mean_w4( + const UINT8 *curI, UINT8 *curO, U32 
kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + "vpbroadcastd %%xmm0, %%zmm11 \n\t" + "vpbroadcastd %%xmm0, %%zmm12 \n\t" + "vpbroadcastd %%xmm0, %%zmm13 \n\t" + "mov $0x80, %%eax \n\t" + "vmovd %%eax, %%xmm1 \n\t" + "vpbroadcastb %%xmm1, %%xmm8 \n\t" + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "mov %%rax, %%r9 \n\t" + "add %%r9, %%r9 \n\t" + "mov %%rax, %%r10 \n\t" + "add %%r9, %%r10 \n\t" + "add %0, %%rax \n\t" + "add %0, %%r9 \n\t" + "add %0, %%r10 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vmovups (%%r9), %%xmm6 \n\t" + "vmovups (%%r10), %%xmm7 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpmovzxbd %%xmm5, %%zmm1 \n\t" + "vpmovzxbd %%xmm6, %%zmm2 \n\t" + "vpmovzxbd %%xmm7, %%zmm3 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm1, %%zmm11 \n\t" + "vpaddd %%zmm12, %%zmm2, %%zmm12 \n\t" + "vpaddd %%zmm13, %%zmm3, %%zmm13 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "add $0x10, %%r9 \n\t" + "add $0x10, %%r10 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "add %%rdi, %%r9 \n\t" + "add %%rdi, %%r10 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "vpmulld %%zmm0, %%zmm11, %%zmm11 \n\t" + "vpmulld %%zmm0, %%zmm12, %%zmm12 \n\t" + "vpmulld %%zmm0, %%zmm13, %%zmm13 \n\t" + "vpsrld $16, %%zmm10, %%zmm10 \n\t" + "vpsrld $16, %%zmm11, %%zmm11 \n\t" + "vpsrld $16, %%zmm12, %%zmm12 \n\t" + "vpsrld $16, %%zmm13, %%zmm13 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm4, %%zmm11 \n\t" + "vpaddd %%zmm12, %%zmm4, %%zmm12 \n\t" + "vpaddd %%zmm13, %%zmm4, %%zmm13 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + "vpmovusdb %%zmm11, 0x10(%1) \n\t" + "vpmovusdb %%zmm12, 0x20(%1) \n\t" + "vpmovusdb %%zmm13, 0x30(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%r10", "%r9", "%rdi", "%zmm0", "%zmm1", "%zmm2", "%zmm3", "%zmm4", + "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm10", "%zmm11", "%zmm12", "%zmm13", "memory", "cc"); +} + +void pooling_c16_mean_w2( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + "vpbroadcastd %%xmm0, %%zmm11 \n\t" + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov %5, %%eax \n\t" + "add %0, %%rax \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vmovups (%%rax), %%xmm5 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpmovzxbd %%xmm5, %%zmm1 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm1, %%zmm11 \n\t" + "add $0x10, %0 \n\t" + "add $0x10, %%rax \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "add %%rdi, %%rax \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "vpmulld %%zmm0, %%zmm11, %%zmm11 \n\t" + "vpsrld $16, 
%%zmm10, %%zmm10 \n\t" + "vpsrld $16, %%zmm11, %%zmm11 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpaddd %%zmm11, %%zmm4, %%zmm11 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + "vpmovusdb %%zmm11, 0x10(%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm5", "%zmm8", "%zmm10", + "%zmm11", "memory", "cc"); +} + +void pooling_c16_mean_w1( + const UINT8 *curI, UINT8 *curO, U32 kw, U32 kh, U32 iStep, U32 stride, I32 poolSize) +{ + __asm__ __volatile__( + "mov %%eax, %%eax \n\t" + "mov %4, %%eax \n\t" + "mov %%rax, %%rdi \n\t" + "mov $-128, %%eax \n\t" + "imul %%ebx, %%eax \n\t" + "imul %2, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm10 \n\t" + ".align 16 \n\t" + "0: \n\t" + "mov %2, %%ecx \n\t" + ".align 16 \n\t" + "1: \n\t" + "vmovups (%0), %%xmm4 \n\t" + "vpmovzxbd %%xmm4, %%zmm0 \n\t" + "vpaddd %%zmm10, %%zmm0, %%zmm10 \n\t" + "add $0x10, %0 \n\t" + "dec %%ecx \n\t" + "jg 1b \n\t" + "add %%rdi, %0 \n\t" + "dec %%ebx \n\t" + "jg 0b \n\t" + "vbroadcastss (%6), %%zmm0 \n\t" + "vpmulld %%zmm0, %%zmm10, %%zmm10 \n\t" + "mov $128, %%eax \n\t" + "vmovd %%eax, %%xmm0 \n\t" + "vpbroadcastd %%xmm0, %%zmm4 \n\t" + "vpsrld $16, %%zmm10, %%zmm10 \n\t" + "vpaddd %%zmm10, %%zmm4, %%zmm10 \n\t" + "vpmovusdb %%zmm10, (%1) \n\t" + : + : "r"(curI), "r"(curO), "r"(kw), "b"(kh), "r"(iStep), "r"(stride), "r"(&poolSize) + : "%eax", "%rax", "%ecx", "%rdi", "%zmm0", "%zmm1", "%zmm4", "%zmm8", "%zmm10", "memory", + "cc"); +} + +EE pooling_c16_uint8(TensorDesc inputDesc, + const UINT8 *input, + PoolingParamSpec p, + TensorDesc outputDesc, + UINT8 *output, + void *scale) +{ + if (nullptr == input || nullptr == output) { + CHECK_STATUS(NULL_POINTER); + } + DataType idt, odt; + DataFormat idf, odf; + U32 in = 0, ic = 0, ih = 0, iw = 0, on = 0, oc = 0, oh = 0, ow = 0; + CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow)); + + if (idt != odt || idt != DT_U8_Q) { + CHECK_STATUS(NOT_MATCH); + } + if (in != on || ic != oc) { + CHECK_STATUS(NOT_MATCH); + } + if (idf != DF_NCHWC16 || odf != idf) { + CHECK_STATUS(NOT_MATCH); + } + + PoolingMode pm = p.mode; + U32 strideH = p.stride_h; + U32 strideW = p.stride_w; + U32 paddingT = p.pad_top; + U32 paddingL = p.pad_left; + U32 kernelSizeH = p.kernel_h; + U32 kernelSizeW = p.kernel_w; + U32 wSize, kh, kw, iStep; + UINT8 *curO; + const UINT8 *curI; + if (paddingT >= kernelSizeH || paddingL >= kernelSizeW) { + CHECK_STATUS(NOT_SUPPORTED); + } + + if (ic % 16 != 0) { + CHECK_STATUS(NOT_MATCH); + } + + F32 *inputScale = (F32 *)scale; + F32 *outputScale = inputScale + 1; + I32 shift = 65536; + I32 factor = shift / (kernelSizeH * kernelSizeW); + if (factor < 1) { + CHECK_STATUS(NOT_SUPPORTED); + } + if (pm == POOLING_MAX) { + *outputScale = *inputScale; + } else { + *outputScale = *inputScale * factor * (kernelSizeW * kernelSizeH) / (F32)shift; + } + + ic /= 16; + U32 owInter = (iw + paddingL - kernelSizeW) / strideW + 1; + U32 wSizes[3] = {1, 2, 4}; + pooling_max_func pooling_max[3] = {pooling_c16_max_w1, pooling_c16_max_w2, pooling_c16_max_w4}; + pooling_mean_func pooling_mean[3] = { + pooling_c16_mean_w1, pooling_c16_mean_w2, pooling_c16_mean_w4}; + F32 poolSize = shift / (kernelSizeH * kernelSizeW); + for (U32 n = 0; n < in; n++) { + for (U32 c = 0; c < ic; c++) { + 
for (U32 h = 0; h < oh; h++) { + for (U32 w = 0; w < ow; w += wSize) { + if (w < owInter) { + wSize = UNI_MIN(owInter - w, UNROLL_W); + } else { + wSize = 1; + } + wSize = wSizes[wSize >> 1]; + int hstart = (int)h * (int)strideH - (int)paddingT; + int wstart = (int)w * (int)strideW - (int)paddingL; + int hend = UNI_MIN(hstart + kernelSizeH, ih); + int wend = UNI_MIN(wstart + kernelSizeW, iw); + hstart = UNI_MAX(hstart, 0); + wstart = UNI_MAX(wstart, 0); + + curI = input + (hstart * iw + wstart) * 16; + curO = output + (h * ow + w) * 16; + kh = hend - hstart; + kw = wend - wstart; + iStep = (iw - kw) * 16; + if (!p.count_include_pad) { + poolSize = shift / (kh * kw); + } + if (kw < kernelSizeW) { + wSize = 1; + } + switch (pm) { + case POOLING_MAX: { + pooling_max[wSize >> 1](curI, curO, kw, kh, iStep, strideW * 16); + break; + } + case POOLING_MEAN: { + pooling_mean[wSize >> 1]( + curI, curO, kw, kh, iStep, strideW * 16, poolSize); + break; + } + default: + return NOT_SUPPORTED; + } + } + } + input += ih * iw * 16; + output += oh * ow * 16; + } + } + return SUCCESS; +} diff --git a/compute/tensor/src/cpu/x86/int8/quantize.cpp b/compute/tensor/src/cpu/x86/int8/quantize.cpp index 5910545b..6b8af3e0 100644 --- a/compute/tensor/src/cpu/x86/int8/quantize.cpp +++ b/compute/tensor/src/cpu/x86/int8/quantize.cpp @@ -20,6 +20,7 @@ inline void getSymmetricQuantizeScale(U32 num16, U32 resMask, const F32 *data, F32 *scale) { + F32 maxVal = 0; __asm__ __volatile__("vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" "mov $0x7FFFFFFF, %%ebx \n\t" "vmovd %%ebx, %%xmm1 \n\t" @@ -55,17 +56,20 @@ inline void getSymmetricQuantizeScale(U32 num16, U32 resMask, const F32 *data, F "vmaxps %%xmm1, %%xmm0, %%xmm0 \n\t" "vpermilps $0b00000001, %%xmm0, %%xmm1 \n\t" "vmaxps %%xmm1, %%xmm0, %%xmm0 \n\t" - "mov $0x42FE0000, %%ebx \n\t" - "vmovd %%ebx, %%xmm1 \n\t" - "vdivps %%xmm0, %%xmm1, %%xmm2 \n\t" - "vmovss %%xmm2, (%1) \n\t" - : "+r"(data), "+r"(scale) + "vmovd %%xmm0, %1 \n\t" + : "+r"(data), "+r"(maxVal) : "r"(num16), "a"(resMask) : "%k2", "%ebx", "%zmm0", "%zmm1", "%zmm2", "memory", "cc"); + if (maxVal == 0) { + *scale = 1; + } else { + *scale = 127 / maxVal; + } } inline void getSymmetricQuantizeScaleI32(U32 num16, U32 resMask, const I32 *data, F32 *scale) { + F32 maxVal = 0; __asm__ __volatile__("vxorps %%zmm0, %%zmm0, %%zmm0 \n\t" "mov %2, %%ebx \n\t" "cmp $0x0, %%ebx \n\t" @@ -98,14 +102,16 @@ inline void getSymmetricQuantizeScaleI32(U32 num16, U32 resMask, const I32 *data "vpmaxsd %%xmm1, %%xmm0, %%xmm0 \n\t" "vpermilps $0b00000001, %%xmm0, %%xmm1 \n\t" "vpmaxsd %%xmm1, %%xmm0, %%xmm0 \n\t" - "mov $0x42FE0000, %%ebx \n\t" - "vmovd %%ebx, %%xmm1 \n\t" "vcvtdq2ps %%xmm0, %%xmm0 \n\t" - "vdivps %%xmm0, %%xmm1, %%xmm2 \n\t" - "vmovss %%xmm2, (%1) \n\t" - : "+r"(data), "+r"(scale) + "vmovd %%xmm0, %1 \n\t" + : "+r"(data), "+r"(maxVal) : "r"(num16), "a"(resMask) : "%k2", "%ebx", "%zmm0", "%zmm1", "%zmm2", "memory", "cc"); + if (maxVal == 0) { + *scale = 1; + } else { + *scale = 127 / maxVal; + } } EE quantizeF32ToU8(TensorDesc dDesc, const F32 *data, TensorDesc *qDesc, UINT8 *qData, F32 *scale) @@ -223,7 +229,7 @@ EE quantizeF32ToI8(TensorDesc dDesc, const F32 *data, TensorDesc *qDesc, INT8 *q return SUCCESS; } -EE quantizeBiasOffsetCI32(F32 *bias, +EE quantizeBiasOffsetCI32(const F32 *bias, TensorDesc biasDesc, INT8 *filter, TensorDesc filterDesc, @@ -233,17 +239,20 @@ EE quantizeBiasOffsetCI32(F32 *bias, U32 N = tensorNumElements(biasDesc); std::set nativeFormat = {DF_NCHW, DF_NHWC, DF_MTK, DF_NORMAL, DF_TRANSPOSE}; I32 
*offsetC = (I32 *)filter; - if (bias == nullptr || N == 0) { + if ((bias == nullptr) && (filter == nullptr)) { + return SUCCESS; + } + if ((bias == nullptr) || (N == 0)) { N = UNI_MAX(filterDesc.dims[0], filterDesc.dims[1]); if (nativeFormat.count(filterDesc.df)) { - memset(offsetCBias, 0, N * bytesOf(DT_I32)); + UNI_MEMSET(offsetCBias, 0, N * bytesOf(DT_I32)); } else { - memcpy(offsetCBias, offsetC, N * bytesOf(DT_I32)); + UNI_MEMCPY(offsetCBias, offsetC, N * bytesOf(DT_I32)); } return SUCCESS; } - if (nativeFormat.count(filterDesc.df)) { + if ((filter == nullptr) || nativeFormat.count(filterDesc.df)) { for (U32 i = 0; i < N; ++i) { offsetCBias[i] = round(bias[i] * scale[0]); } @@ -259,7 +268,12 @@ EE transformU8ToI8(TensorDesc dDesc, const UINT8 *data, TensorDesc *qDesc, INT8 { U32 dataNum = tensorNumElements(dDesc); U32 num16 = dataNum / 64; - I64 resMask = pow(2, dataNum % 64) - 1; + U64 resMask = dataNum % 64; + if (resMask == 63) { + resMask = 0xFFFFFFFFFFFFFFFF; + } else { + resMask = (1LL << resMask) - 1; + } __asm__ __volatile__("mov $0x80, %%ebx \n\t" "vmovd %%ebx, %%xmm1 \n\t" diff --git a/compute/tensor/src/cpu/x86/int8/rnn.cpp b/compute/tensor/src/cpu/x86/int8/rnn.cpp new file mode 100644 index 00000000..ee9fed99 --- /dev/null +++ b/compute/tensor/src/cpu/x86/int8/rnn.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#include "cpu/x86/int8/tensor_computing_int8.h" + +EE rnncell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32* scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + switch (rnnParamSpec.mode) { + case RNN_LSTM: { + ret = lstmcell_int8(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + break; + } + default: + break; + } + return ret; +} diff --git a/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h b/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h index 4639d703..0ec53c5c 100644 --- a/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h +++ b/compute/tensor/src/cpu/x86/int8/tensor_computing_int8.h @@ -21,7 +21,7 @@ EE dequantizeI32ToF32(TensorDesc qDesc, I32 *qData, const F32 *scale, TensorDesc dDesc, F32 *data); -EE quantizeBiasOffsetCI32(F32 *bias, +EE quantizeBiasOffsetCI32(const F32 *bias, TensorDesc biasDesc, INT8 *filter, TensorDesc filterDesc, @@ -40,12 +40,13 @@ EE quantizeI32ToI8(TensorDesc dDesc, const I32 *data, TensorDesc *qDesc, INT8 *q EE convolution_int8(TensorDesc inputDesc, UINT8 *input, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filter, ConvolutionParamSpec convParamSpec, ConvolutionForwardAlgorithm algorithm, TensorDesc biasDesc, - const I32 *bias, + const F32 *bias, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -56,11 +57,12 @@ EE convolution_int8(TensorDesc inputDesc, EE convolution_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -84,11 +86,12 @@ EE convolution_infer_forward_tmp_bytes_int8(TensorDesc inputDesc, EE convolution_1x1_direct(TensorDesc inputDesc, UINT8 *inArray, + F32 *eltwiseInput, TensorDesc filterDesc, const INT8 *filterArray, ConvolutionParamSpec convParamSpec, TensorDesc biasDesc, - const I32 *biasArray, + const F32 *biasArray, U32 tmpBytes, void *tmp, TensorDesc outputDesc, @@ -96,4 +99,79 @@ EE convolution_1x1_direct(TensorDesc inputDesc, F32 *scale, ActivationParamSpec activationDesc); -#endif //CHEETAH_TENSOR_COMPUTING_INT8_H \ No newline at end of file +EE pooling_c16_uint8(TensorDesc inputDesc, + const UINT8 *input, + PoolingParamSpec poolingParamSpec, + TensorDesc outputDesc, + UINT8 *output, + void *scale); + +EE rnncell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE lstmcell_int8(TensorDesc xDesc, + const void *currentX, + const TensorDesc *filterDesc, + const void **filter, + const TensorDesc *biasDesc, + const void **bias, + F32 *scale, + void *state, + U32 tmpBytes, + void *tmp, + RNNParamSpec rnnParamSpec, + U32 batchStrideX, + U32 batchStrideH, + TensorDesc hDesc, + void *output, + Arch arch); + +EE depthwise_pointwise_convolution_int8(TensorDesc inputDesc, + UINT8 *inArray, + F32 *eltwiseInput, + TensorDesc dwFilterDesc, + const INT8 *dwFilterArray, + 
TensorDesc pwFilterDesc, + const INT8 *pwFilterArray, + ConvolutionParamSpec convParamSpec, + TensorDesc dwBiasDesc, + const F32 *dwBiasArray, + TensorDesc pwBiasDesc, + const F32 *pwBiasArray, + U32 tmpBytes, + void *tmp, + TensorDesc outputDesc, + void *outArray, + F32 *scale, + ActivationParamSpec depthwiseActivationParamSpec, + ActivationParamSpec pointwiseActivationParamSpec); + +EE depthwise_convolution_transform_filter_int8(TensorDesc filterDesc, + const INT8 *filter, + TensorDesc *ftmDesc, + INT8 *filterTransformed); + +EE depthwise_pointwise_convolution_transform_filter_int8(TensorDesc dwFilterDesc, + const INT8 *dwFilter, + TensorDesc pwFilterDesc, + const INT8 *pwFilter, + DepthwiseConvolutionForwardAlgorithm algorithm, + TensorDesc *dwFtmDesc, + INT8 *dwFilterTransformed, + TensorDesc *pwFtmDesc, + INT8 *pwFilterTransformed); +#endif //CHEETAH_TENSOR_COMPUTING_INT8_H diff --git a/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h b/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h index 6196d16d..29ef416f 100644 --- a/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h +++ b/compute/tensor/src/cpu/x86/int8/transform_functions_int8.h @@ -24,10 +24,10 @@ inline void PaddingNCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -38,8 +38,8 @@ inline void PaddingNCHWC16( U32 icNum = ic / 16; for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } for (U32 hc = 0; hc < ih * icNum; ++hc) { U32 c = hc / ih; @@ -47,10 +47,10 @@ inline void PaddingNCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); - memcpy(tmp + coff + (hoff + paddingL) * simdW, data + c * ih * iw * simdW + h * iw * simdW, - iw * simdW); - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMCPY(tmp + coff + (hoff + paddingL) * simdW, + data + c * ih * iw * simdW + h * iw * simdW, iw * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; @@ -58,14 +58,14 @@ inline void PaddingNCHWC16( while (resC > 0) { U32 cx = (resC == 12) ? 
8 : resC; // resC: 4, 8, 12, 16 U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); - memcpy( + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMCPY( tmp + coff + (hoff + paddingL) * cx, data + icNum * ih * iw + h * iw * cx, iw * cx); - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } resC -= cx; } @@ -79,10 +79,10 @@ inline void PaddingNCHW2NCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -92,8 +92,8 @@ inline void PaddingNCHW2NCHWC16( U32 icNum = ic / 16; for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } for (U32 hc = 0; hc < ih * icNum; ++hc) { U32 c = hc / ih; @@ -101,7 +101,7 @@ inline void PaddingNCHW2NCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); for (U32 w = 0; w < iw; ++w) { for (U32 s = 0; s < simdW; ++s) { U32 iIdx = (c * simdW + s) * ih * iw + h * iw + w; @@ -109,7 +109,7 @@ inline void PaddingNCHW2NCHWC16( tmp[oIdx] = data[iIdx]; } } - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; @@ -118,11 +118,11 @@ inline void PaddingNCHW2NCHWC16( U32 icx = ic - icNum; U32 cx = (resC == 12) ? 
8 : resC; // resC: 4, 8, 12, 16 U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); for (U32 w = 0; w < iw; ++w) { U32 woff = (hoff + paddingL) * cx + w * cx; for (U32 s = 0; s < icx; ++s) { @@ -130,9 +130,9 @@ inline void PaddingNCHW2NCHWC16( U32 oIdx = coff + woff + s; tmp[oIdx] = data[iIdx]; } - memset(tmp + coff + woff + icx, 128, cx - icx); + UNI_MEMSET(tmp + coff + woff + icx, 128, cx - icx); } - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } resC -= cx; } @@ -146,10 +146,10 @@ inline void PaddingNCHWC8ToNCHWC16( DataFormat idf; U32 in, ic, ih, iw; CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 padih = paddingT + paddingB + ih; U32 padiw = paddingL + paddingR + iw; @@ -161,8 +161,8 @@ inline void PaddingNCHWC8ToNCHWC16( if (paddingT != 0 || paddingB != 0) { for (U32 c = 0; c < icNum; ++c) { U32 coff = c * padih * padiw * simdW; - memset(tmp + coff, 128, padiw * paddingT * simdW); - memset(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * simdW); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * simdW, 128, padiw * paddingB * simdW); } } for (U32 hc = 0; hc < ih * icNum; ++hc) { @@ -171,32 +171,32 @@ inline void PaddingNCHWC8ToNCHWC16( U32 h = hc % ih; U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * simdW, 128, paddingL * simdW); + UNI_MEMSET(tmp + coff + hoff * simdW, 128, paddingL * simdW); for (U32 w = 0; w < iw; ++w) { for (U32 s = 0; s < simdW; s += 8) { U32 iIdx = (c * simdW + s) * ih * iw + (h * iw + w) * 8; U32 oIdx = coff + (hoff + paddingL) * simdW + w * simdW + s; - memcpy(tmp + oIdx, data + iIdx, 8); + UNI_MEMCPY(tmp + oIdx, data + iIdx, 8); } } - memset(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * simdW, 128, paddingR * simdW); } icNum *= 16; if (ic > icNum) { U32 cx = 8; U32 coff = icNum * padih * padiw; - memset(tmp + coff, 128, padiw * paddingT * cx); - memset(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); + UNI_MEMSET(tmp + coff, 128, padiw * paddingT * cx); + UNI_MEMSET(tmp + coff + (ih + paddingT) * padiw * cx, 128, padiw * paddingB * cx); for (U32 h = 0; h < ih; ++h) { U32 hoff = (h + paddingT) * padiw; - memset(tmp + coff + hoff * cx, 128, paddingL * cx); + UNI_MEMSET(tmp + coff + hoff * cx, 128, paddingL * cx); for (U32 w = 0; w < iw; ++w) { U32 iIdx = icNum * ih * iw + (h * iw + w) * 8; U32 oIdx = coff + (hoff + paddingL) * cx + w * cx; - memcpy(tmp + oIdx, data + iIdx, 8); + UNI_MEMCPY(tmp + oIdx, data + iIdx, 8); } - memset(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, 
paddingR * cx); + UNI_MEMSET(tmp + coff + (hoff + (paddingL + iw)) * cx, 128, paddingR * cx); } } } diff --git a/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h b/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h index 79bb1c1b..1b50aeb5 100644 --- a/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h +++ b/compute/tensor/src/cpu/x86/int8/x86_functions_int8.h @@ -24,9 +24,7 @@ inline EE activation_offset_int8( { U32 num32 = len / 32; U32 resMask = pow(2, len % 32) - 1; - EE ret = SUCCESS; - switch (activationDesc.mode) { case ACTIVATION_NULL: { break; diff --git a/compute/tensor/src/cpu/x86/normalization.cpp b/compute/tensor/src/cpu/x86/normalization.cpp index aaf9f160..3cb55145 100644 --- a/compute/tensor/src/cpu/x86/normalization.cpp +++ b/compute/tensor/src/cpu/x86/normalization.cpp @@ -16,8 +16,13 @@ #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif -EE layer_normalization_x86( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output) +EE layer_normalization_x86(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output) { DataType idt = inputDesc.dt; EE ret = SUCCESS; @@ -25,7 +30,7 @@ EE layer_normalization_x86( #ifdef _USE_FP32 case DT_F32: { ret = layer_normalization_fp32( - inputDesc, (F32 *)input, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); + inputDesc, (F32 *)input, p, (F32 *)alpha, (F32 *)beta, outputDesc, (F32 *)output); break; } #endif diff --git a/compute/tensor/src/cpu/x86/pooling.cpp b/compute/tensor/src/cpu/x86/pooling.cpp index 9b7c8d95..b7ca2bdf 100644 --- a/compute/tensor/src/cpu/x86/pooling.cpp +++ b/compute/tensor/src/cpu/x86/pooling.cpp @@ -15,11 +15,14 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif EE pooling_x86(TensorDesc inputDesc, const void *input, PoolingParamSpec poolingParamSpec, - const void *scale, + void *scale, TensorDesc outputDesc, void *output) { @@ -34,6 +37,20 @@ EE pooling_x86(TensorDesc inputDesc, } else if (inputDesc.df == DF_NCHWC16) { ret = pooling_c16_fp32( inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + } else if (inputDesc.df == DF_NCHW) { + ret = pooling_nchw_fp32( + inputDesc, (const F32 *)input, poolingParamSpec, outputDesc, (F32 *)output); + } else { + ret = NOT_SUPPORTED; + } + break; + } +#endif +#ifdef _USE_INT8 + case DT_U8_Q: { + if (inputDesc.df == DF_NCHWC16) { + ret = pooling_c16_uint8(inputDesc, (const UINT8 *)input, poolingParamSpec, + outputDesc, (UINT8 *)output, scale); } else { ret = NOT_SUPPORTED; } @@ -67,4 +84,4 @@ EE pooling_bp_x86(TensorDesc inputDesc, break; } return ret; -} \ No newline at end of file +} diff --git a/compute/tensor/src/cpu/x86/quantize.cpp b/compute/tensor/src/cpu/x86/quantize.cpp index f2b5e732..4dbe0d8d 100644 --- a/compute/tensor/src/cpu/x86/quantize.cpp +++ b/compute/tensor/src/cpu/x86/quantize.cpp @@ -87,8 +87,26 @@ EE quantize_bias_offsetC(const void *bias, switch (qType) { #ifdef _USE_INT8 case DT_I32: { - ret = quantizeBiasOffsetCI32( - (F32 *)bias, biasDesc, (INT8 *)filter, filterDesc, scale, (I32 *)offsetCBias); + ret = quantizeBiasOffsetCI32((const F32 *)bias, biasDesc, (INT8 *)filter, + filterDesc, scale, (I32 *)offsetCBias); + break; + } +#endif + default: + ret = NOT_SUPPORTED; + break; + } + } else if (biasDesc.dt == DT_I32) { + switch (qType) { +#ifdef _USE_INT8 + case DT_I32: { + if (filter == nullptr) { + 
UNI_MEMCPY(offsetCBias, bias, tensorNumBytes(biasDesc)); + } else { + for (U32 i = 0; i < tensorNumElements(biasDesc); ++i) { + ((I32 *)offsetCBias)[i] = ((I32 *)bias)[i] + ((I32 *)filter)[i]; + } + } break; } #endif diff --git a/compute/tensor/src/cpu/x86/rnn.cpp b/compute/tensor/src/cpu/x86/rnn.cpp index 7e9ce1d2..e4e247f0 100644 --- a/compute/tensor/src/cpu/x86/rnn.cpp +++ b/compute/tensor/src/cpu/x86/rnn.cpp @@ -15,7 +15,9 @@ #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif -#include "blas_enhance.h" +#ifdef _USE_INT8 +#include "cpu/x86/int8/tensor_computing_int8.h" +#endif EE rnncell_x86(TensorDesc xDesc, const void *currentX, @@ -23,6 +25,7 @@ EE rnncell_x86(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, U32 tmpBytes, void *tmp, @@ -33,17 +36,25 @@ EE rnncell_x86(TensorDesc xDesc, void *output, Arch arch) { - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (xDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, tmpBytes, - tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + if (0) { +#if defined(_USE_INT8) && defined(_USE_ULTRA_OPTIMIZATION) + } else if (arch == X86_AVX512 && rnnParamSpec.mode == RNN_LSTM && + rnnParamSpec.num_projection == 0) { + ret = rnncell_int8(xDesc, currentX, filterDesc, filter, biasDesc, bias, scale, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); +#endif + } else { + ret = rnncell_fp32(xDesc, currentX, filterDesc, filter, biasDesc, bias, state, + tmpBytes, tmp, rnnParamSpec, batchStrideX, batchStrideH, hDesc, output, arch); + } break; } #endif default: - ret = NOT_SUPPORTED; break; } return ret; diff --git a/compute/tensor/src/cpu/x86/scale.cpp b/compute/tensor/src/cpu/x86/scale.cpp index 6c7ded30..00d8c9a3 100644 --- a/compute/tensor/src/cpu/x86/scale.cpp +++ b/compute/tensor/src/cpu/x86/scale.cpp @@ -12,6 +12,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
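For orientation, the rnncell_x86 change above only takes the INT8 kernel for plain LSTM cells without projection on an AVX-512 build, and falls back to the FP32 cell otherwise. A condensed sketch of that dispatch decision, using hypothetical, simplified types rather than the library's signatures:

    // Hypothetical condensation of the rnncell_x86 selection logic above:
    // the INT8 LSTM kernel is only taken for LSTM cells with no projection
    // on AVX-512; every other configuration uses the FP32 implementation.
    enum class RnnMode { LSTM, GRU };

    struct RnnConfig {
        RnnMode mode;
        int num_projection;
    };

    static const char *pick_rnn_kernel(bool isAvx512, bool int8Enabled, const RnnConfig &cfg)
    {
        if (int8Enabled && isAvx512 && cfg.mode == RnnMode::LSTM && cfg.num_projection == 0) {
            return "rnncell_int8";   // quantized fast path
        }
        return "rnncell_fp32";       // generic fall-back
    }
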
#include "cpu/x86/tensor_computing_x86.h" +#include "cpu/x86/int32/tensor_computing_int32.h" #ifdef _USE_FP32 #include "cpu/x86/fp32/tensor_computing_fp32.h" #endif @@ -37,7 +38,7 @@ EE scale_x86(TensorDesc inputDesc, CHECK_REQUIREMENT(oc % 16 == 0); axis = outputDesc.nDims + 1; } - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (outputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { @@ -46,10 +47,13 @@ EE scale_x86(TensorDesc inputDesc, break; } #endif + case DT_I32: { + ret = scale_int32((I32 *)input, axis, outputDesc.nDims, (I32 *)alpha, (I32 *)beta, on, + oc, elements_per_channel, ic, (I32 *)output); + break; + } default: - ret = NOT_SUPPORTED; break; } - return ret; } diff --git a/compute/tensor/src/cpu/x86/softmax.cpp b/compute/tensor/src/cpu/x86/softmax.cpp index 9c2a37f0..da00fcda 100644 --- a/compute/tensor/src/cpu/x86/softmax.cpp +++ b/compute/tensor/src/cpu/x86/softmax.cpp @@ -19,9 +19,8 @@ EE softmax_x86( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) { - DataType idt = inputDesc.dt; - EE ret = SUCCESS; - switch (idt) { + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { ret = softmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); @@ -29,9 +28,24 @@ EE softmax_x86( } #endif default: - ret = NOT_SUPPORTED; break; } + return ret; +} +EE logsoftmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output) +{ + EE ret = NOT_SUPPORTED; + switch (inputDesc.dt) { +#ifdef _USE_FP32 + case DT_F32: { + ret = logsoftmax_fp32(inputDesc, (const F32 *)input, p.axis, outputDesc, (F32 *)output); + break; + } +#endif + default: + break; + } return ret; } diff --git a/compute/tensor/src/cpu/x86/tensor_computing_x86.h b/compute/tensor/src/cpu/x86/tensor_computing_x86.h index 05f4cef0..16f3fcf0 100644 --- a/compute/tensor/src/cpu/x86/tensor_computing_x86.h +++ b/compute/tensor/src/cpu/x86/tensor_computing_x86.h @@ -102,6 +102,7 @@ EE depthwise_pointwise_convolution_x86(TensorDesc inputDesc, const void *pwFilter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc dwBiasDesc, const void *dwBias, TensorDesc pwBiasDesc, @@ -124,6 +125,7 @@ EE depthwise_convolution_transform_filter_x86(TensorDesc filterDesc, void *filterTransformed); EE depthwise_convolution_infer_forward_tmp_bytes_x86(TensorDesc inputDesc, + TensorDesc dwFilterDesc, TensorDesc outputDesc, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, @@ -135,6 +137,7 @@ EE depthwise_convolution_x86(TensorDesc inputDesc, const void *filter, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, TensorDesc biasDesc, const void *bias, U32 tmpBytes, @@ -152,8 +155,13 @@ EE eltwise_x86(DataType dataType, void *output, EltwiseMode eltwiseMode); -EE layer_normalization_x86( - TensorDesc inputDesc, void *input, void *alpha, void *beta, TensorDesc outputDesc, void *output); +EE layer_normalization_x86(TensorDesc inputDesc, + void *input, + LayerNormParamSpec p, + void *alpha, + void *beta, + TensorDesc outputDesc, + void *output); EE rnncell_x86(TensorDesc xDesc, const void *currentX, @@ -161,6 +169,7 @@ EE rnncell_x86(TensorDesc xDesc, const void **filter, const TensorDesc *biasDesc, const void **bias, + float *scale, void *state, U32 tmpBytes, void *tmp, @@ -182,7 +191,7 @@ EE scale_x86(TensorDesc inputDesc, EE pooling_x86(TensorDesc inputDesc, const void *input, 
PoolingParamSpec poolingParamSpec, - const void *scale, + void *scale, TensorDesc outputDesc, void *output); @@ -197,6 +206,9 @@ EE reshape_x86(TensorDesc inputDesc, void *input, TensorDesc outputDesc, void *o EE softmax_x86( TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); +EE logsoftmax_x86( + TensorDesc inputDesc, const void *input, SoftmaxParamSpec p, TensorDesc outputDesc, void *output); + EE deconvolution_transform_filter_x86(TensorDesc filterDesc, const void *filter, ConvolutionForwardAlgorithm algorithm, diff --git a/compute/tensor/src/cpu/x86/x86_functions.h b/compute/tensor/src/cpu/x86/x86_functions.h index 7c93143f..e9353619 100644 --- a/compute/tensor/src/cpu/x86/x86_functions.h +++ b/compute/tensor/src/cpu/x86/x86_functions.h @@ -91,6 +91,9 @@ inline void array_power_x86(DataType dt, void *input, void *output, I32 len, F32 array_power_f32((F32 *)input, (F32 *)output, len, power); break; #endif + case DT_I64: + array_power_template((I64 *)input, (I64 *)output, len, power); + break; case DT_I32: array_power_template((I32 *)input, (I32 *)output, len, power); break; @@ -112,6 +115,10 @@ inline F32 array_sum_x86(DataType dt, const void *data, I32 len) result = array_sum_f32((const F32 *)data, len); break; #endif + case DT_U32: + case DT_I32: + result = array_sum_i32((const I32 *)data, len); + break; default: CHECK_STATUS(NOT_SUPPORTED); break; @@ -128,6 +135,9 @@ inline void array_scale_x86( array_scale_f32((const F32 *)input, (F32 *)output, len, alpha, beta); break; #endif + case DT_I64: + array_scale_template((const I64 *)input, (I64 *)output, len, alpha, beta); + break; case DT_I32: array_scale_template((const I32 *)input, (I32 *)output, len, alpha, beta); break; @@ -188,6 +198,9 @@ inline EE array_minmax_value_x86(DataType dt, const void *data, I32 len, int mod ret = array_minmax_value_f32((const F32 *)data, len, mode, result); break; #endif + case DT_U32: + ret = array_minmax_value_general(dt, data, len, mode, result); + break; case DT_I32: ret = array_minmax_value_i32((const I32 *)data, len, mode, result); break; diff --git a/compute/tensor/src/cpu/yolov3detectionoutput.cpp b/compute/tensor/src/cpu/yolov3detectionoutput.cpp index 966af6a3..1afe1f17 100644 --- a/compute/tensor/src/cpu/yolov3detectionoutput.cpp +++ b/compute/tensor/src/cpu/yolov3detectionoutput.cpp @@ -12,86 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
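The uint8 pooling path added above (pooling_c16_uint8, plus the DT_U8_Q branch in pooling_x86) averages in fixed point: the accumulator starts at -128 times the window size so the uint8 offset is removed, the signed sum is multiplied by the precomputed poolSize factor (65536 divided by the effective window size) and shifted right by 16, and 128 is added back before saturating to uint8. A scalar approximation of that arithmetic for one output element, assuming count_include_pad is false so the factor is rebuilt from the effective window kh*kw:

    #include <cstdint>

    // Scalar model of the uint8 mean-pooling requantization sketched above.
    static uint8_t mean_pool_u8(const uint8_t *window, int kh, int kw)
    {
        const int32_t factor = 65536 / (kh * kw);   // 16.16 fixed-point 1/n
        int32_t acc = -128 * kh * kw;               // pre-subtract the u8 offset
        for (int i = 0; i < kh * kw; ++i) {
            acc += window[i];                       // acc = sum(x_i - 128)
        }
        int32_t avg = (acc * factor) >> 16;         // fixed-point average
        int32_t v = avg + 128;                      // restore the u8 offset
        if (v < 0) {                                // saturate to [0, 255]
            v = 0;
        }
        if (v > 255) {
            v = 255;
        }
        return (uint8_t)v;
    }
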
#include "cpu/tensor_computing_cpu.h" +#include "cpu/non_max_suppression.h" #include "tensor_transpose.h" -inline EE qsort_descent(std::vector &boxes, std::vector &scores, int left, int right) -{ - if (boxes.empty() || scores.empty()) { - return NOT_SUPPORTED; - } - - int i = left; - int j = right; - F32 temp = scores[(left + right) / 2]; - - while (i <= j) { - while (scores[i] > temp) { - i++; - } - while (scores[j] < temp) { - j--; - } - if (i <= j) { - std::swap(boxes[i], boxes[j]); - std::swap(scores[i], scores[j]); - i++; - j--; - } - } - - if (left < j) { - qsort_descent(boxes, scores, left, j); - } - if (i < right) { - qsort_descent(boxes, scores, i, right); - } - - return SUCCESS; -} - -inline F32 intersectionarea(BoxRect a, BoxRect b) -{ - if (a.xmin > b.xmax || a.xmax < b.xmin || a.ymin > b.ymax || a.ymax < b.ymin) { - return 0.f; - } - F32 inter_width = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin); - F32 inter_height = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin); - - return inter_width * inter_height; -} - -inline EE nms_pickedboxes(std::vector boxes, std::vector &picked, F32 nms_threshold) -{ - I64 n = boxes.size(); - - std::vector areas(n); - for (I64 i = 0; i < n; i++) { - BoxRect box = boxes[i]; - - F32 width = box.xmax - box.xmin; - F32 height = box.ymax - box.ymin; - - areas[i] = width * height; - } - for (I64 i = 0; i < n; i++) { - BoxRect a = boxes[i]; - int keep = 1; - for (int j = 0; j < (int)picked.size(); j++) { - BoxRect b = boxes[picked[j]]; - F32 inter_area = intersectionarea(a, b); - F32 union_area = areas[i] + areas[picked[j]] - inter_area; - - if (inter_area / union_area > nms_threshold) { - keep = 0; - } - } - if (keep) { - picked.push_back(i); - } - } - return SUCCESS; -} - template EE yolov3detectionoutput(std::vector input, T *output, @@ -123,7 +46,6 @@ EE yolov3detectionoutput(std::vector input, } std::vector all_boxrects; - std::vector all_boxscores; I64 input_size = inputDesc.size(); U32 info_per_box = 4 + 1 + num_class; ActivationParamSpec activationdesc_sigmoid; @@ -134,14 +56,11 @@ EE yolov3detectionoutput(std::vector input, CHECK_REQUIREMENT(inputDesc[i].df == DF_NCHWC8 || inputDesc[i].df == DF_NCHW); if (inputDesc[i].df == DF_NCHWC8) { T *tmp = (T *)malloc(tensorNumBytes(inputDesc[0])); - memcpy(tmp, in, tensorNumBytes(inputDesc[0])); + UNI_MEMCPY(tmp, in, tensorNumBytes(inputDesc[0])); CHECK_STATUS(transformToNCHW(inputDesc[0], tmp, inputDesc[0], in)); free(tmp); } - std::vector> allbox_boxrects; - std::vector> allbox_boxscores; - allbox_boxrects.resize(num_box); - allbox_boxscores.resize(num_box); + std::vector> allbox_boxrects(num_box); U32 w = inputDesc[i].dims[0]; U32 h = inputDesc[i].dims[1]; @@ -190,9 +109,9 @@ EE yolov3detectionoutput(std::vector input, F32 box_ymin = box_cy - box_h * 0.5; F32 box_xmax = box_cx + box_w * 0.5; F32 box_ymax = box_cy + box_h * 0.5; - BoxRect box = {box_xmin, box_ymin, box_xmax, box_ymax, label}; + BoxRect box = { + box_xmin, box_ymin, box_xmax, box_ymax, label, score_conf, INT_MAX}; allbox_boxrects[b].push_back(box); - allbox_boxscores[b].push_back(score_conf); } idx++; } @@ -202,34 +121,28 @@ EE yolov3detectionoutput(std::vector input, for (U32 b = 0; b < num_box; b++) { all_boxrects.insert( all_boxrects.end(), allbox_boxrects[b].begin(), allbox_boxrects[b].end()); - all_boxscores.insert( - all_boxscores.end(), allbox_boxscores[b].begin(), allbox_boxscores[b].end()); } } // sort boxes - qsort_descent(all_boxrects, all_boxscores, 0, static_cast(all_boxscores.size() - 1)); + 
std::stable_sort(all_boxrects.begin(), all_boxrects.end(), + [&](const BoxRect &a, const BoxRect &b) { return (a.score > b.score); }); // apply nms - std::vector picked; - nms_pickedboxes(all_boxrects, picked, nms_threshold); + std::vector picked = nms_pickedboxes(all_boxrects, nms_threshold); std::vector boxrects; - std::vector boxscores; - for (I64 p = 0; p < (I64)picked.size(); p++) { + for (U32 p = 0; p < picked.size(); p++) { I64 picked_box = picked[p]; boxrects.push_back(all_boxrects[picked_box]); - boxscores.push_back(all_boxscores[picked_box]); } - U32 num_detected = static_cast(boxrects.size()); + U32 num_detected = boxrects.size(); // the first box contains the number of availble boxes output[0] = num_detected; output[1] = output[2] = output[3] = output[4] = output[5] = 0; for (U32 i = 0; i < num_detected; i++) { BoxRect b = boxrects[i]; - F32 score = boxscores[i]; - output[(i + 1) * 6] = b.label + 1; - output[(i + 1) * 6 + 1] = score; + output[(i + 1) * 6 + 1] = b.score; output[(i + 1) * 6 + 2] = b.xmin; output[(i + 1) * 6 + 3] = b.ymin; output[(i + 1) * 6 + 4] = b.xmax; diff --git a/compute/tensor/src/cumsum.cpp b/compute/tensor/src/cumsum.cpp new file mode 100644 index 00000000..1cda9689 --- /dev/null +++ b/compute/tensor/src/cumsum.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
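The yolov3detectionoutput.cpp rework above drops the local qsort_descent / intersectionarea / nms_pickedboxes helpers in favor of std::stable_sort plus the shared routine from cpu/non_max_suppression.h. The greedy IoU-based NMS it relies on can be sketched as follows (stand-alone hypothetical types, not the library's BoxRect):

    #include <algorithm>
    #include <vector>

    // Hypothetical stand-alone box type; the library's BoxRect also carries
    // a label field, which is omitted here.
    struct Box {
        float xmin, ymin, xmax, ymax, score;
    };

    static float iou(const Box &a, const Box &b)
    {
        float iw = std::min(a.xmax, b.xmax) - std::max(a.xmin, b.xmin);
        float ih = std::min(a.ymax, b.ymax) - std::max(a.ymin, b.ymin);
        if (iw <= 0 || ih <= 0) {
            return 0.f;
        }
        float inter = iw * ih;
        float areaA = (a.xmax - a.xmin) * (a.ymax - a.ymin);
        float areaB = (b.xmax - b.xmin) * (b.ymax - b.ymin);
        return inter / (areaA + areaB - inter);
    }

    // Greedy NMS: visit boxes in descending score order and keep a box only
    // if its IoU with every already-kept box stays at or below the threshold.
    static std::vector<int> nms(std::vector<Box> boxes, float threshold)
    {
        std::stable_sort(boxes.begin(), boxes.end(),
            [](const Box &a, const Box &b) { return a.score > b.score; });
        std::vector<int> picked;
        for (int i = 0; i < (int)boxes.size(); ++i) {
            bool keep = true;
            for (int j : picked) {
                if (iou(boxes[i], boxes[j]) > threshold) {
                    keep = false;
                    break;
                }
            }
            if (keep) {
                picked.push_back(i);
            }
        }
        return picked;
    }
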
+ +#include "tensor_computing.h" +#ifdef _USE_GENERAL +#include "cpu/general/tensor_computing_general.h" +#endif +#ifdef _USE_NEON +#include "cpu/arm/tensor_computing_arm.h" +#endif +#ifdef _USE_X86 +#include "cpu/x86/tensor_computing_x86.h" +#endif + +EE cumsum_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr || outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE cumsum(Tensor inputTensor, CumSumParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + ret = cumsum_general(inputDesc, input, p, outputDesc, output); + } + return ret; +} diff --git a/compute/tensor/src/deconvolution.cpp b/compute/tensor/src/deconvolution.cpp index 9d3256b8..78261c34 100644 --- a/compute/tensor/src/deconvolution.cpp +++ b/compute/tensor/src/deconvolution.cpp @@ -26,9 +26,6 @@ inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc, DataType targetDataType) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } DataType idt, fdt; DataFormat idf, fdf; U32 in, ic, ih, iw; @@ -37,23 +34,22 @@ inline EE deconvolution_infer_output_size_cpu(TensorDesc inputDesc, CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); CHECK_STATUS(tensor4dGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); CHECK_REQUIREMENT(1 == fn || ic == fn); - if (fh < 1 || fw < 1) { - CHECK_STATUS(NOT_SUPPORTED); + return NOT_SUPPORTED; } U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - if (convParamSpec.rm == TF_SAME) { + if (convParamSpec.round_mode == ROUND_TF_SAME) { oh = strideH * ih; ow = strideW * iw; } else { - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; - oh = fh + strideH * (ih - 1) - paddingT - paddingB; - ow = fw + strideW * (iw - 1) - paddingL - paddingR; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; + oh = fh + strideH * (ih - 1) - paddingT - paddingB + convParamSpec.output_pad_h; + ow = fw + strideW * (iw - 1) - paddingL - paddingR + convParamSpec.output_pad_w; } *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fc, oh, ow); @@ -67,32 +63,29 @@ EE deconvolution_infer_output_size(Tensor *inputTensor, DataType targetDataType, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc filterDesc = filterTensor.get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(deconvolution_infer_output_size_cpu( - inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType)); + EE ret = deconvolution_infer_output_size_cpu( + inputDesc, filterDesc, convParamSpec, &outputDesc, targetDataType); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU 
OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS(deconvolution_padding_input_mali( - inputDesc, filterDesc, convParamSpec, &outputDesc, inputMem, outputMem)); + ret = deconvolution_padding_input_mali( + inputDesc, filterDesc, convParamSpec, &outputDesc, inputMem, outputMem); #endif } else { U32 fc = filterDesc.dims[filterDesc.nDims - 2]; if (fc % 8 != 0) { - CHECK_STATUS(NOT_SUPPORTED); + ret = NOT_SUPPORTED; } } outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE deconvolution_infer_forward_algorithm(Tensor inputTensor, @@ -108,7 +101,6 @@ EE deconvolution_infer_forward_algorithm(Tensor inputTensor, TensorDesc inputDesc = inputTensor.get_desc(); TensorDesc filterDesc = filterTensor.get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); - EE ret = NOT_SUPPORTED; auto arch = archInfo->arch; if (IS_GENERAL(arch)) { diff --git a/compute/tensor/src/depth2space.cpp b/compute/tensor/src/depth2space.cpp index 06511b6e..2e764ecb 100644 --- a/compute/tensor/src/depth2space.cpp +++ b/compute/tensor/src/depth2space.cpp @@ -12,6 +12,9 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif @@ -26,7 +29,7 @@ EE depth2space_infer_output_size( CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -34,6 +37,13 @@ EE depth2space_infer_output_size( OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); ret = depth2space_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMem); #endif + } else { + for (int i = 0; i < (int)outputDesc.nDims - 2; i++) { + outputDesc.dims[i] *= p.block_size; + outputDesc.dims[outputDesc.nDims - 2] /= p.block_size; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + ret = SUCCESS; } outputTensor->resize(outputDesc); return ret; @@ -49,6 +59,9 @@ EE depth2space_infer_forward_tmp_bytes( TensorDesc outputDesc = outputTensor.get_desc(); ret = depth2space_infer_tmpBuf_size_mali(inputDesc, p, outputDesc, bytes); #endif + } else { + *bytes = 0; + ret = SUCCESS; } return ret; } @@ -60,16 +73,20 @@ EE depth2space(Tensor inputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - void *tmp = get_ptr_from_tensor(tmpTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); ret = depth2space_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } else { +#ifdef _USE_CPU + ret = depth2space_cpu(inputDesc, input, p, outputDesc, output); #endif } return ret; diff --git a/compute/tensor/src/depthwise_convolution.cpp b/compute/tensor/src/depthwise_convolution.cpp index 31895ff1..aac5fdf7 100644 
--- a/compute/tensor/src/depthwise_convolution.cpp +++ b/compute/tensor/src/depthwise_convolution.cpp @@ -48,10 +48,10 @@ inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -64,7 +64,12 @@ inline EE depthwise_convolution_infer_output_size_cpu(TensorDesc inputDesc, CHECK_STATUS(NOT_MATCH); } - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, ic, oh, ow); + DataFormat odf = DF_NCHWC8; + if ((idt == DT_U8_Q || idf == DF_NCHWC16) && ic % 16 == 0) { + odf = DF_NCHWC16; + } + + *outputDesc = tensor4df(targetDataType, odf, in, ic, oh, ow); return SUCCESS; } @@ -227,9 +232,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, ArchInfo_t archInfo) { TensorDesc inputDesc = inputTensor.get_desc(); -#if defined(_USE_NEON) || defined(_USE_GPU) TensorDesc filterDesc = filterTensor.get_desc(); -#endif TensorDesc outputDesc = outputTensor.get_desc(); EE ret = NOT_SUPPORTED; @@ -242,7 +245,7 @@ EE depthwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_infer_forward_tmp_bytes_x86( - inputDesc, outputDesc, convParamSpec, algorithm, bytes); + inputDesc, filterDesc, outputDesc, convParamSpec, algorithm, bytes); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -263,6 +266,7 @@ EE depthwise_convolution(Tensor inputTensor, Tensor filterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor biasTensor, Tensor tmpTensor, Tensor outputTensor, @@ -290,7 +294,7 @@ EE depthwise_convolution(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_x86(inputDesc, input, filterDesc, filter, convParamSpec, - algorithm, biasDesc, bias, tmpBytes, tmp, outputDesc, output, + algorithm, scale, biasDesc, bias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, archInfo->arch); #endif #ifdef _USE_NEON diff --git a/compute/tensor/src/depthwise_pointwise_convolution.cpp b/compute/tensor/src/depthwise_pointwise_convolution.cpp index d34bc889..22d8cfc6 100644 --- a/compute/tensor/src/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/depthwise_pointwise_convolution.cpp @@ -50,10 +50,10 @@ inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc input U32 strideH = convParamSpec.stride_h; U32 strideW = convParamSpec.stride_w; - U32 paddingT = convParamSpec.padding_top; - U32 paddingB = convParamSpec.padding_bottom; - U32 paddingL = convParamSpec.padding_left; - U32 paddingR = convParamSpec.padding_right; + U32 paddingT = convParamSpec.pad_top; + U32 paddingB = convParamSpec.pad_bottom; + U32 paddingL = convParamSpec.pad_left; + U32 paddingR = convParamSpec.pad_right; U32 dilateH = convParamSpec.dilatedRate_h; U32 dilateW = convParamSpec.dilatedRate_w; @@ -66,7 +66,12 @@ inline EE depthwise_pointwise_convolution_infer_output_size_cpu(TensorDesc input CHECK_STATUS(NOT_MATCH); } - *outputDesc = tensor4df(targetDataType, DF_NCHWC8, in, fn2, oh, ow); + DataFormat odf = 
DF_NCHWC8; + if ((idt == DT_U8_Q || idf == DF_NCHWC16) && ic % 16 == 0) { + odf = DF_NCHWC16; + } + + *outputDesc = tensor4df(targetDataType, odf, in, fn2, oh, ow); return SUCCESS; } @@ -103,6 +108,15 @@ EE depthwise_pointwise_convolution_infer_output_size(Tensor *inputTensor, if (fn % 8 != 0) { CHECK_STATUS(NOT_SUPPORTED); } +#ifdef _USE_INT8 + if (IS_X86_AVX512(archInfo->arch) && (inputDesc.dt == DT_U8_Q)) + { + outputDesc.df = DF_NCHWC16; + if (fn % 16 != 0) { + CHECK_STATUS(NOT_SUPPORTED); + } + } +#endif } outputTensor->resize(outputDesc); return SUCCESS; @@ -180,7 +194,17 @@ EE depthwise_pointwise_convolution_transform_filter_bytes(Tensor dwFilterTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { U32 *size = (U32 *)dwBytes; - *size = tensorNumBytes(dwFilterDesc) + 32; + if (DT_I8 == dwFilterDesc.dt) { + DataType fdt; + DataFormat fdf; + U32 fn, fc, fh, fw; + CHECK_STATUS(tensor4dGet(dwFilterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw)); + U32 alignSize = 4; + U32 filterSize = (fh * fw + alignSize - 1) / alignSize * alignSize; + *size = filterSize * fn * fc + 32 + fc * 4; + } else { + *size = tensorNumBytes(dwFilterDesc) + 32; + } size = (U32 *)pwBytes; *size = tensorNumBytes(pwFilterDesc) + 32; ret = SUCCESS; @@ -281,7 +305,7 @@ EE depthwise_pointwise_convolution_infer_forward_tmp_bytes(Tensor inputTensor, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_convolution_infer_forward_tmp_bytes_x86( - inputDesc, outputDesc, convParamSpec, algorithm, bytes); + inputDesc, dwFilterDesc, outputDesc, convParamSpec, algorithm, bytes); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { @@ -303,6 +327,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, Tensor pwFilterTensor, ConvolutionParamSpec convParamSpec, DepthwiseConvolutionForwardAlgorithm algorithm, + void *scale, Tensor dwBiasTensor, Tensor pwBiasTensor, std::vector tmpTensors, @@ -358,7 +383,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, #ifdef _USE_X86 } else if (IS_X86(arch)) { ret = depthwise_pointwise_convolution_x86(inputDesc, input, eltwiseInput, dwFilterDesc, - dwFilter, pwFilterDesc, pwFilter, convParamSpec, algorithm, dwBiasDesc, dwBias, + dwFilter, pwFilterDesc, pwFilter, convParamSpec, algorithm, scale, dwBiasDesc, dwBias, pwBiasDesc, pwBias, tmpBytes, tmp, outputDesc, output, depthwiseActivationParamSpec, pointwiseActivationParamSpec, archInfo->arch); #endif @@ -388,7 +413,7 @@ EE depthwise_pointwise_convolution(std::vector inputTensors, if (inputTensors.size() > 1 && isEltwiseSeperate) { std::vector eltwiseInputTensors = {outputTensor, inputTensors[1]}; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = ELTWISE_SUM; + eltwiseDesc.mode = ELTWISE_SUM; eltwiseDesc.activation_type = eltwiseActDesc.mode; eltwiseDesc.activation_spec = convParamSpec.activation_spec; ret = eltwise(eltwiseInputTensors, eltwiseDesc, tmpTensors[0], outputTensor, archInfo); diff --git a/compute/tensor/src/eltwise.cpp b/compute/tensor/src/eltwise.cpp index ff892065..795db3ed 100644 --- a/compute/tensor/src/eltwise.cpp +++ b/compute/tensor/src/eltwise.cpp @@ -26,37 +26,24 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, TensorDesc *outputDesc) { if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } U32 num = inputDesc.size(); - if (num <= 0) { + if (num <= 1) { return NOT_MATCH; } - if (num == 1) { - *outputDesc = inputDesc[0]; - return SUCCESS; - } - U32 arrayDimMax = 0; - U32 minDims = inputDesc[0].nDims; for (U32 i = 1; i < num; i++) { if (inputDesc[i].nDims > 
inputDesc[arrayDimMax].nDims) { arrayDimMax = i; } - if (inputDesc[i].nDims < minDims) { - minDims = inputDesc[i].nDims; - } } U32 nchwc8Count = 0; U32 nchwc16Count = 0; U32 nhwcCount = 0; + bool sameDim = true; for (U32 i = 0; i < num; i++) { - // Output from 1D-conv + 3D tensors - //if (inputDesc[i].nDims == 4 && inputDesc[i].dims[0] == 1 && minDims == 3) { - // inputDesc[i] = tensor3df(inputDesc[i].dt, inputDesc[i].df, inputDesc[i].dims[3], - // inputDesc[i].dims[2], inputDesc[i].dims[1]); - //} if (inputDesc[i].df == DF_NCHWC8) { nchwc8Count++; } @@ -68,12 +55,13 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, Tenso nhwcCount++; std::swap(inputDesc[i].dims[0], inputDesc[i].dims[1]); } + if (tensorNumElements(inputDesc[i]) != tensorNumElements(inputDesc[0])) { + sameDim = false; + } } - U32 dim = inputDesc[arrayDimMax].nDims; *outputDesc = inputDesc[arrayDimMax]; - - for (U32 i = 0; i < dim; i++) { + for (U32 i = 0; i < outputDesc->nDims; i++) { for (U32 j = 0; j < num; j++) { if (inputDesc[j].nDims > i) { int max_value = UNI_MAX(outputDesc->dims[i], inputDesc[j].dims[i]); @@ -92,13 +80,9 @@ inline EE eltwise_infer_output_size_cpu(std::vector inputDesc, Tenso if (nchwc16Count > 0 && nchwc16Count != num) { outputDesc->df = DF_NCHWC16; } - //if (nchwc8Count > 0 && nhwcCount > 0) { - // outputDesc->df = DF_NCHWC8; - // if (outputDesc->nDims == 3) { - // *outputDesc = tensor4df(outputDesc->dt, DF_NCHWC8, outputDesc->dims[2], - // outputDesc->dims[1], outputDesc->dims[0], 1); - // } - //} + if (!sameDim && (nchwc8Count > 0 || nchwc16Count > 0)) { + outputDesc->df = DF_NCHW; + } return SUCCESS; } @@ -167,8 +151,11 @@ EE eltwise(std::vector inputTensor, void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); + + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU #if defined(_USE_NEON) && defined(_USE_INT8) - if (!IS_GPU(arch)) { for (U32 i = 0; i < inputTensor.size(); i++) { if (inputDesc[i].dt == DT_I8) { F32 scale = inputTensor[i].get_scale(); @@ -182,12 +169,7 @@ EE eltwise(std::vector inputTensor, tmp = (U8 *)tmp + dTensor.bytes(); } } - } #endif - - EE ret = NOT_SUPPORTED; - if (IS_CPU(arch)) { -#ifdef _USE_CPU ret = eltwise_cpu(inputDesc, input, eltwiseDesc, tmpBytes, tmp, outputDesc, output, arch); #endif #ifdef _USE_GPU diff --git a/compute/tensor/src/embedding.cpp b/compute/tensor/src/embedding.cpp index d9ca84c2..98ebf7cc 100644 --- a/compute/tensor/src/embedding.cpp +++ b/compute/tensor/src/embedding.cpp @@ -44,7 +44,7 @@ EE embedding_infer_output_size(Tensor *inputTensor, } CHECK_REQUIREMENT(tensorIs2d(inputDesc)); CHECK_STATUS(tensor2dGet(inputDesc, &dt, &df, &batch, &step)); - outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_output); + outputDesc = tensor3df(outputDt, DF_MTK, batch, step, p.num_outputs); if (inputOneDim) { outputDesc.nDims = 2; outputDesc.df = DF_NORMAL; diff --git a/compute/tensor/src/equal.cpp b/compute/tensor/src/equal.cpp index ee129c48..50a86a43 100644 --- a/compute/tensor/src/equal.cpp +++ b/compute/tensor/src/equal.cpp @@ -10,6 +10,7 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
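The reworked eltwise_infer_output_size_cpu above derives the output shape by broadcasting: each output dimension is the maximum of that dimension over all inputs, and the format drops back to DF_NCHW when the inputs differ in element count while blocked (NCHWC8/NCHWC16) layouts are involved. A simplified sketch of the dimension rule with plain vectors, assuming dims are stored innermost-first as in TensorDesc:

    #include <algorithm>
    #include <vector>

    // Simplified model of the broadcast shape rule above: every output
    // dimension is the maximum of that dimension over all inputs; missing
    // leading dimensions are treated as 1.
    static std::vector<unsigned> broadcast_dims(const std::vector<std::vector<unsigned>> &inputs)
    {
        size_t rank = 0;
        for (const auto &d : inputs) {
            rank = std::max(rank, d.size());
        }
        std::vector<unsigned> out(rank, 1);
        for (const auto &d : inputs) {
            for (size_t i = 0; i < d.size(); ++i) {
                out[i] = std::max(out[i], d[i]);
            }
        }
        return out;
    }
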
+#if 0 #include "tensor_computing.h" EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) @@ -22,9 +23,8 @@ EE equal_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t } // attention: comparision ptr will be fixed in mt -template -static EE equal_kernel( - U32 inputLen, U32 comparisonLen, T *inputPtr, F32 *comparisionPtr, bool not_equal, U8 *outputPtr) +template +static EE equal_kernel(T1 *a1, int len1, T2 *a2, int len2, bool not_equal, U8 *out) { U8 equal_flag, notequal_flag; if (not_equal) { @@ -34,27 +34,27 @@ static EE equal_kernel( equal_flag = 1; notequal_flag = 0; } - if (inputLen == comparisonLen) { - for (U32 i = 0; i < inputLen; ++i) { - if (inputPtr[i] == (T)(comparisionPtr[i])) { - outputPtr[i] = equal_flag; + EE ret = SUCCESS; + if (len1 == len2) { + for (int i = 0; i < len1; ++i) { + if (a1[i] == (T1)(a2[i])) { + out[i] = equal_flag; } else { - outputPtr[i] = notequal_flag; + out[i] = notequal_flag; } } - } else if (comparisonLen == 1) { - F32 compF = comparisionPtr[0]; - for (U32 i = 0; i < inputLen; ++i) { - if (inputPtr[i] == (T)compF) { - outputPtr[i] = equal_flag; + } else if (len2 == 1) { + for (int i = 0; i < len1; ++i) { + if (a1[i] == (T1)(a2[0])) { + out[i] = equal_flag; } else { - outputPtr[i] = notequal_flag; + out[i] = notequal_flag; } } } else { - return NOT_SUPPORTED; + ret = NOT_SUPPORTED; } - return SUCCESS; + return ret; } EE equal(Tensor inputTensor, @@ -64,37 +64,43 @@ EE equal(Tensor inputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - void *input = get_ptr_from_tensor(inputTensor, arch); - void *comparision = get_ptr_from_tensor(compareTensor, arch); - void *output = get_ptr_from_tensor(outputTensor, arch); TensorDesc inputDesc = inputTensor.get_desc(); U32 inputLen = tensorNumElements(inputDesc); - U32 comparisonLen = tensorNumElements(compareTensor.get_desc()); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc compareDesc = compareTensor.get_desc(); + U32 compareLen = tensorNumElements(compareDesc); + void *compare = get_ptr_from_tensor(compareTensor, arch); + void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = SUCCESS; + EE ret = NOT_SUPPORTED; switch (inputDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = equal_kernel( - inputLen, comparisonLen, (F32 *)input, (F32 *)comparision, p.invert, (U8 *)output); + ret = equal_kernel( + (F32 *)input, inputLen, (F32 *)compare, compareLen, p.invert, (U8 *)output); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = equal_kernel( - inputLen, comparisonLen, (F16 *)input, (F32 *)comparision, p.invert, (U8 *)output); + if (compareDesc.dt == DT_F32) { + ret = equal_kernel( + (F16 *)input, inputLen, (F32 *)compare, compareLen, p.invert, (U8 *)output); + } else { + ret = equal_kernel( + (F16 *)input, inputLen, (F16 *)compare, compareLen, p.invert, (U8 *)output); + } break; } #endif case DT_I32: { - ret = equal_kernel( - inputLen, comparisonLen, (I32 *)input, (F32 *)comparision, p.invert, (U8 *)output); + ret = equal_kernel( + (I32 *)input, inputLen, (I32 *)compare, compareLen, p.invert, (U8 *)output); break; } default: - ret = NOT_SUPPORTED; break; } return ret; } +#endif diff --git a/compute/tensor/src/expand.cpp b/compute/tensor/src/expand.cpp index a34b9562..88c8316d 100644 --- a/compute/tensor/src/expand.cpp +++ b/compute/tensor/src/expand.cpp @@ -21,16 +21,16 @@ EE expand_infer_output_size( { TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = inputDesc; - 
CHECK_REQUIREMENT((I32)inputDesc.nDims <= p.shape_size); - outputDesc.nDims = (U32)p.shape_size; + CHECK_REQUIREMENT((I32)inputDesc.nDims <= p.num_shape); + outputDesc.nDims = (U32)p.num_shape; I32 inputDims = inputDesc.nDims; - for (I32 i = 0; i < p.shape_size; ++i) { - I32 reverseDim = p.shape_size - 1 - i; + for (I32 i = 0; i < p.num_shape; ++i) { + I32 reverseDim = p.num_shape - 1 - i; if ((reverseDim >= inputDims) || (reverseDim < inputDims && inputDesc.dims[reverseDim] == 1)) { - outputDesc.dims[reverseDim] = p.shape_dims[i]; + outputDesc.dims[reverseDim] = p.shape[i]; } else { - CHECK_REQUIREMENT(p.shape_dims[i] <= (I32)inputDesc.dims[reverseDim]); + CHECK_REQUIREMENT(p.shape[i] <= (I32)inputDesc.dims[reverseDim]); outputDesc.dims[reverseDim] = inputDesc.dims[reverseDim]; } } @@ -41,6 +41,10 @@ EE expand_infer_output_size( } #endif } + if (outputDesc.dt == DT_F32 && outputDesc.nDims == 4 && + outputDesc.dims[outputDesc.nDims - 2] % 8 == 0) { + outputDesc.df = DF_NCHWC8; + } outputTensor->resize(outputDesc); return SUCCESS; } @@ -48,17 +52,20 @@ EE expand_infer_output_size( EE expand_infer_forward_tmp_bytes( Tensor inputTensor, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { + TensorDesc outputDesc = outputTensor.get_desc(); + TensorDesc inputDesc = inputTensor.get_desc(); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - TensorDesc inputDesc = inputTensor.get_desc(); - TensorDesc outputDesc = outputTensor.get_desc(); CHECK_STATUS(expand_infer_forward_tmp_bytes_mali( inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes)); #endif } else { *bytes = 0; + if (outputDesc.df != inputDesc.df) { + *bytes += tensorNumBytes(outputDesc); + } } return SUCCESS; } @@ -80,10 +87,10 @@ void expand_copy_kernel(U32 dims, if (dims == lastDims) { if (dims >= inDims || inD[dims] == 1) { for (U32 i = 0; i < outD[dims]; ++i) { - memcpy(output + i * minCopySize, input, minCopySize); + UNI_MEMCPY(output + i * minCopySize, input, minCopySize); } } else { - memcpy(output, input, minCopySize * inD[dims]); + UNI_MEMCPY(output, input, minCopySize * inD[dims]); } return; } @@ -97,7 +104,7 @@ void expand_copy_kernel(U32 dims, expand_copy_kernel( dims - 1, inDims, outDims, inD, outD, input, output, dt, lastDims, minCopySize); for (U32 i = 1; i < outD[dims]; ++i) { - memcpy(output + i * oOffSize, output, oOffSize); + UNI_MEMCPY(output + i * oOffSize, output, oOffSize); } return; } @@ -120,11 +127,14 @@ EE expand( auto arch = archInfo->arch; void *input = get_ptr_from_tensor(inputTensor, arch); void *output = get_ptr_from_tensor(outputTensor, arch); + void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc inputDesc = inputTensor.get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); + if (outputDesc.df != inputDesc.df) { + output = tmp; + } if (IS_GPU(arch)) { #ifdef _USE_GPU - void *tmp = get_ptr_from_tensor(tmpTensor, arch); CHECK_STATUS(expand_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output)); #endif @@ -144,6 +154,11 @@ EE expand( expand_copy_kernel((outputDesc.nDims - 1), inputDesc.nDims, outputDesc.nDims, inputDesc.dims, outputDesc.dims, (U8 *)input, (U8 *)output, idt, lastDims, minCopySize); + if (outputDesc.df != inputDesc.df) { + TensorDesc oldDesc = outputDesc; + oldDesc.df = inputDesc.df; + transformFormat(oldDesc, output, outputDesc, get_ptr_from_tensor(outputTensor, arch)); + } } return 
SUCCESS; } diff --git a/compute/tensor/src/fully_connected.cpp b/compute/tensor/src/fully_connected.cpp index 811f2a65..99ff0b92 100644 --- a/compute/tensor/src/fully_connected.cpp +++ b/compute/tensor/src/fully_connected.cpp @@ -11,8 +11,6 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include - #include "tensor_computing.h" #include "blas_enhance.h" #ifdef _USE_GPU @@ -196,10 +194,12 @@ EE fully_connected_transform_filter_bytes(Tensor filterTensor, void *bytes, Arch } else if (IS_X86(archInfo->arch)) { alignments = 8; #ifdef _USE_INT8 + alignments = 16; + fh = (fh + 8 - 1) / 8 * 8; *size += UNI_MAX(fw, fh) * 4; #endif } - fh = (fh + alignments - 1) / alignments * alignments; + fw = (fw + alignments - 1) / alignments * alignments; *size += fw * fh + 32; *size *= bytesOf(fdt); } @@ -248,7 +248,7 @@ EE fully_connected_transform_filter_kernel(TensorDesc inputDesc, } } } else { - memcpy(filterTransformed, filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(filterTransformed, filter, tensorNumBytes(filterDesc)); } U32 fh_after = fh; @@ -391,13 +391,7 @@ EE fully_connected(Tensor inputTensor, qIDesc.dt = DT_I8; qODesc.dt = DT_I32; } - if (qIDesc.dt != idt) { - CHECK_STATUS(quantize_cpu(inputDesc, input, &qIDesc, tmp, &scaleI, arch)); - inputDesc = qIDesc; - idt = qIDesc.dt; - input = (U8 *)tmp; - tmp = (U8 *)tmp + tensorNumBytes(inputDesc); - } + CHECK_REQUIREMENT(idt == qIDesc.dt); scaleO = scaleI * filterTensor.get_scale(); if (IS_X86(arch)) { @@ -406,8 +400,10 @@ EE fully_connected(Tensor inputTensor, if (outputDesc.dt != qODesc.dt) { offsetC += tensorNumBytes(qODesc); } + void *transOffsetC = (void *)((U8 *)filter + + UNI_ALIGN(filterDesc.dims[0], 16) * UNI_ALIGN(filterDesc.dims[1], 8)); CHECK_STATUS(quantize_bias_offsetC( - bias, biasDesc, DT_I32, filter, filterDesc, &scaleO, offsetC)); + bias, biasDesc, DT_I32, transOffsetC, filterDesc, &scaleO, offsetC)); bias = nullptr; if (outputDesc.dt == DT_U8_Q && outputTensor.get_scale() > 0) { scale[1] = scale[1] / scaleO; @@ -421,7 +417,7 @@ EE fully_connected(Tensor inputTensor, CHECK_REQUIREMENT(DT_I8 == outputDesc.dt); biasDesc.dt = DT_I32; I32 *biasI = (I32 *)tmp; -#ifdef __aarch64__ +#ifdef _USE_FP16 F16 *biasF = (F16 *)bias; #else F32 *biasF = (F32 *)bias; @@ -452,11 +448,11 @@ EE fully_connected(Tensor inputTensor, U8 *outArray = (U8 *)output; U32 size = tensorNumBytes(biasDesc); for (U32 i = 0; i < M; i++) { - memcpy(outArray + i * size, bias, size); + UNI_MEMCPY(outArray + i * size, bias, size); } } } else { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_MEMSET(output, 0, tensorNumBytes(outputDesc)); } // If weight is transformed for mmm, don't run as mvm diff --git a/compute/tensor/src/gather.cpp b/compute/tensor/src/gather.cpp index 6feeaf54..a6eddbab 100644 --- a/compute/tensor/src/gather.cpp +++ b/compute/tensor/src/gather.cpp @@ -55,11 +55,8 @@ EE gather_infer_output_size(Tensor *dataTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - if (dataTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (dataTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc dataDesc = dataTensor->get_desc(); TensorDesc indexDesc = indexTensor->get_desc(); @@ -76,13 +73,20 @@ EE gather_infer_output_size(Tensor *dataTensor, } outputDesc.nDims = e + 
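In the fully_connected path above, when a bias is present the bias vector is copied once into every output row before the matrix multiply accumulates on top of it. A small illustrative helper (not part of the patch; names are made up):

```cpp
#include <cstring>

// Pre-fill each of the M output rows with the bias vector so the subsequent
// matrix multiply can accumulate into an already-biased buffer.
void fill_rows_with_bias(void *output, const void *bias, size_t rowBytes, size_t M)
{
    unsigned char *out = static_cast<unsigned char *>(output);
    for (size_t i = 0; i < M; i++) {
        memcpy(out + i * rowBytes, bias, rowBytes);
    }
}
```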
indexDesc.nDims; } else { - outputDesc = dataDesc; + outputDesc = indexDesc; + outputDesc.dt = dataDesc.dt; if (!p.element_level) { + outputDesc = dataDesc; if (tensorNumElements(indexDesc) == 1 && p.index_scalar) { for (int i = axis; i < (int)outputDesc.nDims - 1; i++) { outputDesc.dims[i] = outputDesc.dims[i + 1]; } - outputDesc.nDims--; + if (outputDesc.nDims > 1) { + outputDesc.nDims--; + } else { + outputDesc.dims[0] = 1; + outputDesc.df = DF_SCALAR; + } } else { for (int i = (int)outputDesc.nDims - 1; i > axis; i--) { outputDesc.dims[i + indexDesc.nDims - 1] = outputDesc.dims[i]; @@ -105,8 +109,16 @@ EE gather_infer_output_size(Tensor *dataTensor, } #endif } + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(dataDesc)) { + ret = gather_cpu(dataDesc, dataDesc.dims + dataDesc.nDims, indexDesc, + indexDesc.dims + indexDesc.nDims, p, nullptr, outputDesc, + outputDesc.dims + outputDesc.nDims); + } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE gather_infer_forward_tmp_bytes(Tensor dataTensor, @@ -117,13 +129,14 @@ EE gather_infer_forward_tmp_bytes(Tensor dataTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - + EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { if (dataTensor.get_desc().df == DF_NCHWC8) { *bytes = dataTensor.bytes(); } else { *bytes = 0; } + ret = SUCCESS; #ifdef _USE_GPU } else if (IS_GPU(arch)) { TensorDesc dataDesc = dataTensor.get_desc(); @@ -131,9 +144,9 @@ EE gather_infer_forward_tmp_bytes(Tensor dataTensor, TensorDesc outputDesc = outputTensor.get_desc(); GCLMemDesc gclmemDataDesc = ocl_get_desc(dataTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - CHECK_STATUS(gather_infer_forward_tmp_bytes_mali( - dataDesc, gclmemDataDesc, indexDesc, p, outputDesc, gclmemOutputDesc, bytes)); + ret = gather_infer_forward_tmp_bytes_mali( + dataDesc, gclmemDataDesc, indexDesc, p, outputDesc, gclmemOutputDesc, bytes); #endif } - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/gpu/mali/activation.cpp b/compute/tensor/src/gpu/mali/activation.cpp index d011a26f..9af9f23f 100644 --- a/compute/tensor/src/gpu/mali/activation.cpp +++ b/compute/tensor/src/gpu/mali/activation.cpp @@ -36,7 +36,8 @@ inline EE activation_checkpara_mali(GCLHandle_t handle, activationMode != ACTIVATION_H_SWISH && activationMode != ACTIVATION_GELU && activationMode != ACTIVATION_TANH && activationMode != ACTIVATION_SIGMOID && activationMode != ACTIVATION_ABS && activationMode != ACTIVATION_LOG && - activationMode != ACTIVATION_NEG) { + activationMode != ACTIVATION_NEG && activationMode != ACTIVATION_EXP && + activationMode != ACTIVATION_SWISH) { CHECK_STATUS(NOT_SUPPORTED); } if (input->desc.memFormat != output->desc.memFormat) { diff --git a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp index 3113d809..e888381d 100644 --- a/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp +++ b/compute/tensor/src/gpu/mali/bilateral_slice_apply.cpp @@ -33,7 +33,7 @@ inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, if (nullptr == handle || nullptr == input || nullptr == grid || nullptr == output) { return NULL_POINTER; } - if (bilateralSliceApplyParamSpec.mode == BSliceApply_NULL && nullptr == guide) { + if (bilateralSliceApplyParamSpec.mode == BSLICE_APPLY_NULL && nullptr == guide) { return NULL_POINTER; } if (inputDesc.df != guideDesc.df || inputDesc.df != gridDesc.df) { @@ -51,15 +51,14 @@ inline EE bilateral_slice_apply_checkpara_mali_common(GCLHandle_t handle, 
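The gather shape-inference change above starts the output from the index tensor's shape (taking the data tensor's data type) and, when the index is a single scalar, squeezes the gathered axis out of the data shape, collapsing a 1-D input to a scalar descriptor. A hedged sketch of the squeeze step only; this uses an outermost-first shape vector for readability, whereas the library stores dims innermost-first, so the real index arithmetic differs:

```cpp
#include <vector>

// Remove the gathered axis from a shape when the index is a single scalar,
// as gather_infer_output_size does; a 1-D input degenerates to a scalar-like shape.
std::vector<int> squeeze_axis(std::vector<int> shape, int axis)
{
    if (shape.size() > 1) {
        shape.erase(shape.begin() + axis);
    } else {
        shape[0] = 1;
    }
    return shape;
}
```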
if (inputDesc.dims[2] != outputDesc.dims[2]) { return NOT_MATCH; } - if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient_len) != 0) { + if ((gridDesc.dims[2] % bilateralSliceApplyParamSpec.coefficient) != 0) { return NOT_MATCH; } if (bilateralSliceApplyParamSpec.has_offset == true) { - if (bilateralSliceApplyParamSpec.coefficient_len != - inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { + if (bilateralSliceApplyParamSpec.coefficient != inputDesc.dims[2] * (inputDesc.dims[2] + 1)) { return NOT_MATCH; } - if (bilateralSliceApplyParamSpec.coefficient_len != 12) { + if (bilateralSliceApplyParamSpec.coefficient != 12) { return NOT_SUPPORTED; } } else { diff --git a/compute/tensor/src/gpu/mali/cast.cpp b/compute/tensor/src/gpu/mali/cast.cpp index 218f795d..b959807b 100644 --- a/compute/tensor/src/gpu/mali/cast.cpp +++ b/compute/tensor/src/gpu/mali/cast.cpp @@ -42,9 +42,9 @@ inline void set_dt_name(TensorDesc desc, char *name) { DataType dt = desc.dt; if (dt == DT_F16) { - strcpy(name, "f16"); + UNI_STRCPY(name, "f16"); } else if (dt == DT_I32) { - strcpy(name, "i32"); + UNI_STRCPY(name, "i32"); } else { CHECK_STATUS(NOT_SUPPORTED); } diff --git a/compute/tensor/src/gpu/mali/check.cpp b/compute/tensor/src/gpu/mali/check.cpp index 084b824a..80aea30d 100644 --- a/compute/tensor/src/gpu/mali/check.cpp +++ b/compute/tensor/src/gpu/mali/check.cpp @@ -41,7 +41,7 @@ inline EE check_checkpara_mali(GCLHandle_t handle, if (outputDesc.dt != DT_I32) { CHECK_STATUS(NOT_MATCH); } - if (p.check_mode != CHECK_EQUAL) { + if (p.mode != CHECK_EQUAL) { CHECK_STATUS(NOT_SUPPORTED); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/activation.cl b/compute/tensor/src/gpu/mali/cl/activation.cl index 590889f3..530ff37e 100644 --- a/compute/tensor/src/gpu/mali/cl/activation.cl +++ b/compute/tensor/src/gpu/mali/cl/activation.cl @@ -51,7 +51,7 @@ __kernel void MANGLE_NAME(activation_, IOM, FM, AM)(const int w, LOAD_MEM_V4_COMMON(val, idx, idy, idz, iw_str, ih_str, i_off, input); ACTIVATION_V4(val); -#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) +#if defined(USE_TANH) || defined(USE_SIGMOID) || defined(USE_HSIGMOID) || defined(USE_GELU) || defined(USE_EXP) char ec = (((idz << 2) + 4) <= c) ? 4 : (c & 3); if (ec < 2) { val.y = 0; diff --git a/compute/tensor/src/gpu/mali/cl/col2im.cl b/compute/tensor/src/gpu/mali/cl/col2im.cl index ff20e0b4..2685cc74 100644 --- a/compute/tensor/src/gpu/mali/cl/col2im.cl +++ b/compute/tensor/src/gpu/mali/cl/col2im.cl @@ -48,7 +48,7 @@ __kernel void MANGLE_NAME(col2im_, IOM)(const int iw, int sidh_j = pidy % sh; int in_hx = (sidh_i < ih) ? sidh_i : (ih - 1); int in_hy = (sidh_i < ih) ? sidh_j : ((sidh_i - ih + 1) * sh + sidh_j); - int in_hl = (fw - in_hy + sh - 1) / sh; + int in_hl = (fh - in_hy + sh - 1) / sh; if (in_hl > in_hx + 1) { in_hl = in_hx + 1; } diff --git a/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl b/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl new file mode 100644 index 00000000..32fa8ce5 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_invgemm_col2img.cl @@ -0,0 +1,80 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
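The one-token fix in col2im.cl above (fw replaced by fh) makes the row count an integer ceiling division of the remaining filter height by the vertical stride. A tiny reference for that identity, with an illustrative example in the comment:

```cpp
// Integer ceiling division as used in the col2im bound:
// in_hl = ceil((fh - in_hy) / sh) == (fh - in_hy + sh - 1) / sh.
static inline int ceil_div(int a, int b)
{
    return (a + b - 1) / b;
}
// Example: fh = 5, in_hy = 2, sh = 2  ->  ceil_div(3, 2) == 2 contributing filter rows.
```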
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, IOM, AM) base##IOM##AM +#define MANGLE_NAME(base, IOM, AM) MANGLE_NAME_IMPL(base, IOM, AM) + +__kernel void MANGLE_NAME(conv_invgemm_col2img_, IOM, AM)(const int iw, + const int ih, + const int fw, + const int fh, + const int pw, + const int ph, + const int ow_str, + const int oh_str, + const int o_off, + const int oc, + const int bx, + const int by, + __global const T *in, + __read_only image1d_t bias, + KERNEL_MEM out) +{ + const int idx = get_global_id(0); + const int idy = get_global_id(1); + const int idz = get_global_id(2); + const ushort c_pitch = (oc + 3) >> 2; + const int idc = idz % c_pitch; + if (idx >= bx || idy >= by) { + return; + } + + const int pidx = idx + pw; + const int pidy = idy + ph; + + int in_hx = (pidy < ih) ? pidy : (ih - 1); + int in_hy = (pidy < ih) ? 0 : (pidy - ih + 1); + int in_hl = fh - in_hy; + if (in_hl > in_hx + 1) { + in_hl = in_hx + 1; + } + if (pidy < 0) { + in_hl = 0; + } + + int in_wx = (pidx < iw) ? pidx : (iw - 1); + int in_wy = (pidx < iw) ? 0 : (pidx - iw + 1); + int in_wl = fw - in_wy; + if (in_wl > in_wx + 1) { + in_wl = in_wx + 1; + } + if (pidx < 0) { + in_wl = 0; + } + + int in_off_h = iw * (in_hx + ih * fw * (in_hy + idz * fh)); + int in_str_h = iw * (ih * fw - 1); + int in_off_w = in_wx + in_wy * ih * iw; + int in_str_w = ih * iw - 1; + T4 sum = read_imageh(bias, sampler, idc); + + for (int i = 0; i < in_hl; i++) { + for (int j = 0; j < in_wl; j++) { + sum += vload4(in_off_h + in_off_w + j * in_str_w, in); + } + in_off_h += in_str_h; + } + ACTIVATION_V4(sum); + STORE_MEM_V4_COMMON(sum, idx, idy, idz, ow_str, oh_str, o_off, out); +} diff --git a/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl b/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl new file mode 100644 index 00000000..e9402b61 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/conv_invgemm_trans_flt.cl @@ -0,0 +1,52 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
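The new conv_invgemm_col2img kernel above clamps, for each padded output coordinate, the first contributing input index and the number of overlapping filter taps (stride 1), then accumulates those taps plus the bias and applies the activation. A hedged CPU-side reading of just the bound computation, with hypothetical names:

```cpp
// How many filter taps overlap padded output position p (stride 1), following
// the in_hx / in_hy / in_hl computation in conv_invgemm_col2img.
struct Overlap {
    int firstInput;   // highest contributing input index
    int firstFilter;  // first contributing filter tap
    int count;        // number of overlapping taps
};

Overlap overlap_1d(int p, int inputLen, int filterLen)
{
    Overlap o;
    o.firstInput = (p < inputLen) ? p : (inputLen - 1);
    o.firstFilter = (p < inputLen) ? 0 : (p - inputLen + 1);
    o.count = filterLen - o.firstFilter;
    if (o.count > o.firstInput + 1) {
        o.count = o.firstInput + 1;
    }
    if (p < 0) {
        o.count = 0;
    }
    return o;
}
```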
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "kernel_def.h" +#define MANGLE_NAME_IMPL(base, K) base##K +#define MANGLE_NAME(base, K) MANGLE_NAME_IMPL(base, K) + +__kernel void MANGLE_NAME(conv_invgemm_trans_flt_, K)(const int fw, + const int fh, + const int fwh, + const int fc, + const int fn, + __global const T *fltdata, + __global T *flt) +{ + int idx = get_global_id(0); + int idy = get_global_id(1); + int idz = get_global_id(2); + int iy = idy << 2; + const int flt_off = (idz * fc + iy) * fwh + idx; + T4 val = 0; + val.x = fltdata[flt_off]; + if (iy + 1 < fc) { + val.y = fltdata[flt_off + fwh]; + } + if (iy + 2 < fc) { + val.z = fltdata[flt_off + fwh * 2]; + } + if (iy + 3 < fc) { + val.w = fltdata[flt_off + fwh * 3]; + } + const int bc = (fc + 3) >> 2; + int ox = idz & 3; + int oy = idy; + int oz = (idz >> 2) * fwh + fwh - 1 - idx; + int K_pitch = K >> 2; + ox = ox + (oz % K_pitch) * 4; + oz = oz / K_pitch; + + int out_off = (oz * bc + oy) * K + ox; + vstore4(val, out_off, flt); +} diff --git a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl index cdf2e23c..95f1a6fd 100644 --- a/compute/tensor/src/gpu/mali/cl/gemm_tn.cl +++ b/compute/tensor/src/gpu/mali/cl/gemm_tn.cl @@ -74,8 +74,7 @@ #if defined(USE_OUTPUT_IMG) #define ADD_C_OFF(off) \ { \ - \ - off.z += 1; \ + off.z += 1; \ } #else #define ADD_C_OFF(off) \ diff --git a/compute/tensor/src/gpu/mali/cl/kernel_def.h b/compute/tensor/src/gpu/mali/cl/kernel_def.h index edf721c3..81a451ce 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_def.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_def.h @@ -245,36 +245,36 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | #define LOAD_BUF_ARRAY5(v, off, buf) \ { \ T4 tmp = vload4(0, buf + off); \ - v[0] = tmp.s0; \ - v[1] = tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ v[4] = buf[off + 4]; \ } #define LOAD_BUF_ARRAY6(v, off, buf) \ - { \ + { \ T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ T2 tmpex = vload2(0, buf + off + 4); \ - v[0] = tmp.s0; \ - v[1] = tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ - v[4] = tmpex.s0; \ - v[5] = tmpex.s1; \ + v[4] = tmpex.x; \ + v[5] = tmpex.y; \ } #define LOAD_BUF_ARRAY7(v, off, buf) \ { \ T4 tmp = vload4(0, buf + off); \ + v[0] = tmp.x; \ + v[1] = tmp.y; \ + v[2] = tmp.z; \ + v[3] = tmp.w; \ T3 tmpex = vload3(0, buf + off + 4); \ - v[0] = tmp.s0; \ - v[1] = 
tmp.s1; \ - v[2] = tmp.s2; \ - v[3] = tmp.s3; \ - v[4] = tmpex.s0; \ - v[5] = tmpex.s1; \ - v[6] = tmpex.s2; \ + v[4] = tmpex.x; \ + v[5] = tmpex.y; \ + v[6] = tmpex.z; \ } #define LOAD_BUF_ARRAY8(v, off, buf) \ @@ -1341,6 +1341,14 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | v.s2 = 1.0 / (1.0 + exp(-1.0 * v.s2)); \ v.s3 = 1.0 / (1.0 + exp(-1.0 * v.s3)); \ } +#elif defined(USE_SWISH) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = v.s0 / (1.0 + exp(-1.0 * v.s0)); \ + v.s1 = v.s1 / (1.0 + exp(-1.0 * v.s1)); \ + v.s2 = v.s2 / (1.0 + exp(-1.0 * v.s2)); \ + v.s3 = v.s3 / (1.0 + exp(-1.0 * v.s3)); \ + } #elif defined(USE_ABS) #define ACTIVATION_V4(v) \ { \ @@ -1365,6 +1373,14 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | v.s2 = -v.s2; \ v.s3 = -v.s3; \ } +#elif defined(USE_EXP) +#define ACTIVATION_V4(v) \ + { \ + v.s0 = exp(v.s0); \ + v.s1 = exp(v.s1); \ + v.s2 = exp(v.s2); \ + v.s3 = exp(v.s3); \ + } #else #define ACTIVATION_V1(v) \ {} diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h index d4dd65c7..78067099 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/activation_opt.h @@ -17,8 +17,9 @@ inline EE set_activation_opt_mali(bool useNchwFormat, CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); char modeName[128] = ""; CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); - sprintf(kernelName, "activation_%s%s%s", ioMemName, formatName.c_str(), modeName); - sprintf(kernelOpt->sourceName, "activation"); + std::string kernel = std::string("activation_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "activation"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_activation_define_opt(activeMode, opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h index 5422fcd5..bb7914d2 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/cast_opt.h @@ -36,8 +36,9 @@ inline EE set_cast_opt_mali(bool useNchwFormat, } else { CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "cast_%s%s_to_%s", formatName.c_str(), idtName.c_str(), odtName.c_str()); - sprintf(kernelOpt->sourceName, "cast"); + std::string kernel = std::string("cast_") + formatName + idtName + std::string("_to_") + odtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "cast"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h index a5e8ad24..188c7dfa 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/channel_resize_opt.h @@ -15,8 +15,9 @@ inline EE set_channel_resize_opt_mali(bool useNchwFormat, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "channel_resize_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "channel_resize"); + std::string kernel = std::string("channel_resize_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + 
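The new ACTIVATION_V4 variants above add Swish and Exp: Swish is x·sigmoid(x) = x / (1 + e^{-x}), and Exp is an element-wise exponential; the OpenCL macros apply the same formula to each of the four vector lanes. A scalar reference for the formulas:

```cpp
#include <cmath>

// Scalar reference for the USE_SWISH and USE_EXP activation macros.
static inline float swish(float x)
{
    return x / (1.0f + std::exp(-x));  // equivalently x * sigmoid(x)
}

static inline float activation_exp(float x)
{
    return std::exp(x);
}
```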
UNI_STRCPY(kernelOpt->sourceName, "channel_resize"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h index e2efdb24..0866f53a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/clip_opt.h @@ -10,14 +10,14 @@ inline EE set_clip_opt_mali(bool useNchwFormat, { char *opt = kernelOpt->option; kernelOpt->kernelDataType = dt; - std::string formatName = ""; - if (useNchwFormat) { - formatName = "nchw_"; - } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "clip_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "clip"); + std::string name = "clip_" + std::string(ioMemName); + if (useNchwFormat) { + name += "nchw_"; + } + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "clip"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h index cdf928c4..130d201a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/common_opt.h @@ -9,7 +9,7 @@ inline EE set_chars_define_opt(const char *optName, char *&opt) std::string sopt = "-D"; sopt += optName; sopt += " "; - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -22,7 +22,7 @@ inline EE set_value_define_opt(U32 val, const char *valName, char *&opt) sopt += "="; sopt += sval; sopt += " "; - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -64,11 +64,17 @@ inline EE set_activation_define_opt(ActivationMode activeMode, char *&opt) case ACTIVATION_NEG: sopt = "-DUSE_NEG -D AM=neg_ "; break; + case ACTIVATION_EXP: + sopt = "-DUSE_EXP -D AM=exp_ "; + break; + case ACTIVATION_SWISH: + sopt = "-DUSE_SWISH -D AM=swish_ "; + break; default: CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -109,11 +115,17 @@ inline EE set_activation_mode_name(ActivationMode activeMode, char *name) case ACTIVATION_NEG: sname = "neg_"; break; + case ACTIVATION_EXP: + sname = "exp_"; + break; + case ACTIVATION_SWISH: + sname = "swish_"; + break; default: CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -143,7 +155,7 @@ inline EE set_eltwise_define_opt(EltwiseMode eltwiseMode, char *&opt) CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -174,7 +186,7 @@ inline EE set_eltwise_mode_name(EltwiseMode eltwiseMode, char *name) CHECK_STATUS(NOT_SUPPORTED); break; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -198,7 +210,7 @@ inline EE set_io_mem_define_opt(GCLMemType inputType, GCLMemType outputType, cha } else { def += "-D IOM= "; } - strcpy(opt, def.c_str()); + UNI_STRCPY(opt, def.c_str()); opt += def.length(); return SUCCESS; } @@ -215,7 +227,7 @@ inline EE set_io_mem_name(GCLMemType inputType, GCLMemType outputType, char *nam } else if (useInputImg && useOutputImg) { sname = "iom_"; } - strcpy(name, sname.c_str()); + 
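The helpers above assemble the OpenCL build options by appending "-DNAME " or "-DNAME=value " tokens to a growing option buffer and advancing the write pointer by the written length. A minimal sketch of the same idea using std::string instead of the raw char buffer; the exact spacing of the real defines is taken from the hunks above and the helper name here is hypothetical:

```cpp
#include <string>

// Append a build define of the form "-D NAME=value " to an option string,
// mirroring what set_value_define_opt does with a char buffer and pointer bump.
void append_define(std::string &options, const std::string &name, unsigned value)
{
    options += "-D " + name + "=" + std::to_string(value) + " ";
}
// e.g. append_define(opts, "AXIS", 2) leaves "... -D AXIS=2 " in opts.
```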
UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -273,7 +285,7 @@ inline EE set_io_mems_name_and_define_opts(GCLMemType *inputMemType, } CHECK_STATUS(set_chars_define_opt(iomDef.c_str(), opt)); - strcpy(name, iom.c_str()); + UNI_STRCPY(name, iom.c_str()); return SUCCESS; } @@ -291,7 +303,7 @@ inline EE set_data_type_name(DataType dt, char *name) } else { return NOT_SUPPORTED; } - strcpy(name, sname.c_str()); + UNI_STRCPY(name, sname.c_str()); return SUCCESS; } @@ -309,7 +321,7 @@ inline EE set_data_type_define_opt(DataType dt, char *&opt) } else { return NOT_SUPPORTED; } - strcpy(opt, sopt.c_str()); + UNI_STRCPY(opt, sopt.c_str()); opt += sopt.length(); return SUCCESS; } @@ -323,8 +335,9 @@ inline EE set_common_opt(DataType dt, { char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "%s_%s", sourceName, ioMemName); - strcpy(kernelOpt->sourceName, sourceName); + std::string kernel = sourceName + std::string("_") + ioMemName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, sourceName); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); @@ -339,7 +352,7 @@ inline bool check_qualcomm_device(char *devName = nullptr) if (useQualcommDev) { dev = "_qc"; } - strcpy(devName, dev.c_str()); + UNI_STRCPY(devName, dev.c_str()); } return useQualcommDev; } @@ -347,7 +360,7 @@ inline bool check_qualcomm_device(char *devName = nullptr) inline EE add_qcom_acc_16_bit_opt(char *&opt) { std::string qcom_acc = "-qcom-accelerate-16-bit "; - strcpy(opt, qcom_acc.c_str()); + UNI_STRCPY(opt, qcom_acc.c_str()); opt += qcom_acc.length(); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h index bc6e723d..95ebf549 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/concat_opt.h @@ -46,11 +46,12 @@ inline EE set_concat_opt_mali(U32 concatDim, char iomName[128] = ""; CHECK_STATUS( set_io_mems_name_and_define_opts(inputMemType, &outputMemType, inputNum, 1, iomName, opt)); - sprintf(kernelName, "concat_%s%s%s%d", formatName.c_str(), iomName, dimName.c_str(), inputNum); + std::string kernel = "concat_" + formatName + iomName + dimName + std::to_string(inputNum); + UNI_STRCPY(kernelName, kernel.c_str()); if (useNchwFormat) { - sprintf(kernelOpt->sourceName, "concat_nchw"); + UNI_STRCPY(kernelOpt->sourceName, "concat_nchw"); } else { - sprintf(kernelOpt->sourceName, "concat"); + UNI_STRCPY(kernelOpt->sourceName, "concat"); } kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(inputNum, "N", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h index 44137315..2db31b5a 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_depthwise_opt.h @@ -12,8 +12,10 @@ inline EE set_conv_depthwise_trans_flt(U32 workFiltersPerThread, kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_name(GCL_MEM_BUF, outputMemType, ioMemName)); - sprintf(kernelName, "conv_depthwise_trans_fltbuf_%s%d", ioMemName, item_k); - sprintf(kernelOpt->sourceName, "conv_depthwise_trans_fltbuf"); + std::string kernel = + std::string("conv_depthwise_trans_fltbuf_") + ioMemName + 
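Throughout these kernel_option headers the sprintf calls are replaced by std::string concatenation followed by a copy into the caller-provided name buffer. A hedged sketch of that pattern; the 128-byte capacity and the helper name are assumptions for illustration, not the library's API:

```cpp
#include <cstring>
#include <string>

// Compose a kernel name from its parts and copy it into a fixed caller buffer,
// as the refactored set_*_opt_mali helpers do via std::string + UNI_STRCPY.
void set_kernel_name(char *kernelName, const std::string &base,
    const std::string &ioMemName, const std::string &suffix, int inputNum)
{
    const std::string name = base + ioMemName + suffix + std::to_string(inputNum);
    strncpy(kernelName, name.c_str(), 127);
    kernelName[127] = '\0';
}
```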
std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_depthwise_trans_fltbuf"); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); return SUCCESS; @@ -54,10 +56,11 @@ inline EE set_conv_depthwise_opt_mali(U32 fw, if (outputNchwMode) { formatName = "nchw_"; } - sprintf(kernelName, "conv_depthwise_sh%d%s_%s%s%s%d%d%d", sh, devName, ioMemName, modeName, - formatName.c_str(), fw, fh, ON); - - sprintf(kernelOpt->sourceName, "conv_depthwise_sh%d%s", sh, devName); + std::string source = std::string("conv_depthwise_sh") + std::to_string(sh) + devName; + std::string kernel = source + std::string("_") + ioMemName + modeName + formatName + + std::to_string(fw) + std::to_string(fh) + std::to_string(ON); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -152,10 +155,13 @@ inline EE set_conv_depthwise_dila_opt_mali(U32 fw, if (outputNchwMode) { formatName = "nchw_"; } - sprintf(kernelName, "conv_depthwise_sh%d_%s%s%s%s%d%d%d", sh, dilaMode.c_str(), ioMemName, - modeName, formatName.c_str(), fw, fh, ON); - - sprintf(kernelOpt->sourceName, "conv_depthwise_sh%d_dila", sh); + std::string kernel = std::string("conv_depthwise_sh") + std::to_string(sh) + std::string("_") + + dilaMode + ioMemName + modeName + formatName + std::to_string(fw) + std::to_string(fh) + + std::to_string(ON); + UNI_STRCPY(kernelName, kernel.c_str()); + std::string source = + std::string("conv_depthwise_sh") + std::to_string(sh) + std::string("_dila"); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h index 965a1b4c..60e901f0 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_direct_opt.h @@ -21,9 +21,10 @@ inline EE set_conv_direct_trans_flt(U32 workChannelsPerThread, transWHName = "hw_"; CHECK_STATUS(set_chars_define_opt("USE_TRANS_WH", opt)); } - sprintf(kernelName, "conv_direct_trans_flt_%s%s%d%d", ioMemName, transWHName.c_str(), item_c, - item_k); - sprintf(kernelOpt->sourceName, "conv_direct_trans_flt"); + std::string kernel = std::string("conv_direct_trans_flt_") + ioMemName + transWHName + + std::to_string(item_c) + std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_trans_flt"); CHECK_STATUS(set_value_define_opt(item_c, "C", opt)); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); @@ -81,15 +82,18 @@ inline EE set_conv_direct_opt_mali(U32 fw, biasName = "nobias_"; } + std::string kernel, source; if (ft > 1) { - sprintf(kernelName, "conv_direct_3d_sh%d%s_%s%s%s%d%d%d%d%d", sh, devName, ioMemName, - modeName, biasName.c_str(), fw, fh, ft, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_3d_sh%d%s", sh, devName); + source = std::string("conv_direct_3d_sh") + std::to_string(sh) + std::string(devName); + kernel = source + std::string("_") + ioMemName + modeName + biasName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ft) + std::to_string(ON) + std::to_string(KN); } else { - sprintf(kernelName, 
"conv_direct_sh%d%s_%s%s%s%d%d%d%d", sh, devName, ioMemName, modeName, - biasName.c_str(), fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sh%d%s", sh, devName); + source = std::string("conv_direct_sh") + std::to_string(sh) + std::string(devName); + kernel = source + std::string("_") + ioMemName + modeName + biasName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); } + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; @@ -192,9 +196,11 @@ inline EE set_conv_direct_multi_batch_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "conv_direct_multi_batch_sh%d_%s%s%d%d%d%d%d", sh, ioMemName, modeName, fw, - fh, ON, KN, BN); - sprintf(kernelOpt->sourceName, "conv_direct_multi_batch_sh%d", sh); + std::string source = std::string("conv_direct_multi_batch_sh") + std::to_string(sh); + std::string kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN) + std::to_string(BN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -284,8 +290,10 @@ inline EE set_conv_direct_reuse_w_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "conv_direct_sw1_reuse_w_%s%s%d%d%d%d", ioMemName, modeName, fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sw1_reuse_w"); + std::string kernel = std::string("conv_direct_sw1_reuse_w_") + ioMemName + modeName + + std::to_string(fw) + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_sw1_reuse_w"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -353,15 +361,20 @@ inline EE set_conv_direct_nchw_to_nchwc4_opt_mali(U32 fw, CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel, source; if (ft > 1) { - sprintf(kernelName, "conv_direct_3d_sw%d_nchw_to_nchwc4%s_%s%s%d%d%d%d", sw, devName, - ioMemName, modeName, fw, fh, ft, ON); - sprintf(kernelOpt->sourceName, "conv_direct_3d_sw%d_nchw_to_nchwc4%s", sw, devName); + source = std::string("conv_direct_3d_sw") + std::to_string(sw) + + std::string("_nchw_to_nchwc4") + devName; + kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ft) + std::to_string(ON); } else { - sprintf(kernelName, "conv_direct_sw%d_nchw_to_nchwc4%s_%s%s%d%d%d", sw, devName, ioMemName, - modeName, fw, fh, ON); - sprintf(kernelOpt->sourceName, "conv_direct_sw%d_nchw_to_nchwc4%s", sw, devName); + source = std::string("conv_direct_sw") + std::to_string(sw) + + std::string("_nchw_to_nchwc4") + devName; + kernel = source + std::string("_") + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON); } + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = 
kernelOpt->option; @@ -441,9 +454,13 @@ inline EE set_conv_direct_dila_opt_mali(U32 fw, if (dh == 2) { dilaMode = "dila2_"; } - sprintf(kernelName, "conv_direct_sh%d%s_%s%s%s%d%d%d%d", sh, devName, dilaMode.c_str(), - ioMemName, modeName, fw, fh, ON, KN); - sprintf(kernelOpt->sourceName, "conv_direct_sh%d%s_dila", sh, devName); + std::string kernel = std::string("conv_direct_sh") + std::to_string(sh) + devName + + std::string("_") + dilaMode + ioMemName + modeName + std::to_string(fw) + + std::to_string(fh) + std::to_string(ON) + std::to_string(KN); + std::string source = + std::string("conv_direct_sh") + std::to_string(sh) + devName + std::string("_dila"); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (ON < 1 || ON > 8) { @@ -517,9 +534,10 @@ inline EE set_conv_direct_sh1_fn_spe_opt_mali(U32 fw, if (useNchwFormat) { formatName = "nchw_"; } - sprintf(kernelName, "conv_direct_sh1_fn_spe_%s%s%s%d%d%d", ioMemName, modeName, - formatName.c_str(), fw, fh, ON); - sprintf(kernelOpt->sourceName, "conv_direct_sh1_fn_spe"); + std::string buffer = std::string("conv_direct_sh1_fn_spe_") + ioMemName + modeName + + formatName + std::to_string(fw) + std::to_string(fh) + std::to_string(ON); + UNI_STRCPY(kernelName, buffer.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_direct_sh1_fn_spe"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h new file mode 100644 index 00000000..13973247 --- /dev/null +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_invgemm_opt.h @@ -0,0 +1,37 @@ +#ifndef CONV_INVGEMM_OPT +#define CONV_INVGEMM_OPT +#include "common_opt.h" +inline EE set_conv_invgemm_trans_flt_opt( + U32 workFiltersPerThread, DataType dt, char *kernelName, KernelOpt *kernelOpt) +{ + kernelOpt->kernelDataType = dt; + char *opt = kernelOpt->option; + U32 item_k = workFiltersPerThread; + std::string kernel = std::string("conv_invgemm_trans_flt_") + std::to_string(item_k); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_invgemm_trans_flt"); + CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); + return SUCCESS; +} + +inline EE set_conv_invgemm_col2img_opt(ActivationMode activeMode, + DataType dt, + GCLMemType inputMemType, + GCLMemType outputMemType, + char *kernelName, + KernelOpt *kernelOpt) +{ + char *opt = kernelOpt->option; + kernelOpt->kernelDataType = dt; + char modeName[128]; + CHECK_STATUS(set_activation_mode_name(activeMode, modeName)); + char ioMemName[128] = ""; + CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel = std::string("conv_invgemm_col2img_") + ioMemName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_invgemm_col2img"); + CHECK_STATUS(set_activation_define_opt(activeMode, opt)); + CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); + return SUCCESS; +} +#endif diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h index 9ece942c..17f7e988 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/conv_wino_opt.h @@ -7,13 +7,14 @@ inline EE set_conv_wino_rotate_flt( kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; 
U32 fwh = fw * fh; - sprintf(kernelName, "conv_wino_rotate_fltbuf_%d", fwh); - sprintf(kernelOpt->sourceName, "conv_wino_rotate_fltbuf"); + std::string kernel = std::string("conv_wino_rotate_fltbuf_") + std::to_string(fwh); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_rotate_fltbuf"); CHECK_STATUS(set_value_define_opt(fwh, "FWH", opt)); return SUCCESS; } -inline EE set_conv_wino_preprocess_input_opt(DataType dt, +inline EE set_conv_wino_preprocess_input_opt(DataType dt, bool useNchwFormat, GCLMemType inputMemType, GCLMemType outputMemType, @@ -24,12 +25,13 @@ inline EE set_conv_wino_preprocess_input_opt(DataType dt, char *opt = kernelOpt->option; kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - std::string formatName= ""; + std::string formatName = ""; if (useNchwFormat) { formatName = "nchw"; } - sprintf(kernelName, "conv_wino_preprocess_input_%s%s", ioMemName, formatName.c_str()); - sprintf(kernelOpt->sourceName, "conv_wino_preprocess_input"); + std::string kernel = std::string("conv_wino_preprocess_input_") + ioMemName + formatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_preprocess_input"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); @@ -55,8 +57,9 @@ inline EE set_conv_wino_trans_outbuf_opt(bool useAlign, if (useAlign) { alignName = "align"; } - sprintf(kernelName, "conv_wino_trans_outbuf_%s%s%s", ioMemName, modeName, alignName.c_str()); - sprintf(kernelOpt->sourceName, "conv_wino_trans_outbuf"); + std::string kernel = std::string("conv_wino_trans_outbuf_") + ioMemName + modeName + alignName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "conv_wino_trans_outbuf"); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); if (useAlign) { CHECK_STATUS(set_chars_define_opt("USE_ALIGN", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h index d3463fcf..8707b599 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/copy_opt.h @@ -28,8 +28,9 @@ inline EE set_copy_opt_mali(bool useBlockIndex, DataType dt, char *kernelName, K CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "copy_%s%s", BINDName.c_str(), dtName.c_str()); - sprintf(kernelOpt->sourceName, "copy"); + std::string kernel = std::string("copy_") + BINDName + dtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "copy"); kernelOpt->kernelDataType = dt; if (useBlockIndex) { CHECK_STATUS(set_chars_define_opt("USE_BLOCK_INDEX", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h index a60098aa..2f031a4f 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/deconv_opt.h @@ -15,8 +15,10 @@ inline EE set_deconv_gemm_trans_fltbuf(U32 workChannelsPerThread, char *opt = kernelOpt->option; CHECK_STATUS(set_io_mem_name(GCL_MEM_BUF, outputMemType, ioMemName)); - sprintf(kernelName, "deconv_gemm_trans_fltbuf_%d%d", item_c, item_k); - sprintf(kernelOpt->sourceName, "deconv_gemm_trans_fltbuf"); + std::string kernel = + std::string("deconv_gemm_trans_fltbuf_") + std::to_string(item_c) + std::to_string(item_k); + UNI_STRCPY(kernelName, 
kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "deconv_gemm_trans_fltbuf"); CHECK_STATUS(set_value_define_opt(item_c, "C", opt)); CHECK_STATUS(set_value_define_opt(item_k, "K", opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); @@ -131,9 +133,11 @@ inline EE set_deconv_gemm_f2s2_opt(U32 workChannelsPerThread, if (reuseOnW) { reuseOnWName = "w_"; } - sprintf(kernelName, "deconv_gemm_f2s2%s_%s%s%s%d%d", devName, reuseOnWName.c_str(), ioMemName, - modeName, ON, KN); - sprintf(kernelOpt->sourceName, "deconv_gemm_f2s2%s", devName); + std::string source = std::string("deconv_gemm_f2s2") + devName; + std::string kernel = source + std::string("_") + reuseOnWName + ioMemName + modeName + + std::to_string(ON) + std::to_string(KN); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); U32 IN = ON; U32 LN = ON; CHECK_STATUS(set_value_define_opt(ON, "ON", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h index c00eeae7..97bc43d8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/depth2space_opt.h @@ -17,8 +17,9 @@ inline EE set_depth2space_nchwc4_2x2_opt(bool useOutputNchw, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = DT_F16; - sprintf(kernelName, "depth2space_nchwc4_2x2_%s%s", ioMemName, outputFormatName.c_str()); - sprintf(kernelOpt->sourceName, "depth2space_nchwc4_2x2"); + std::string kernel = std::string("depth2space_nchwc4_2x2_") + ioMemName + outputFormatName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "depth2space_nchwc4_2x2"); if (useOutputNchw) { CHECK_STATUS(set_chars_define_opt("OUT_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h index 8b81788b..8bb8fae8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/eltwise_opt.h @@ -24,9 +24,10 @@ inline EE set_eltwise_opt_mali(U32 inputNum, char iomName[128] = ""; CHECK_STATUS( set_io_mems_name_and_define_opts(inputMemType, &outputMemType, inputNum, 1, iomName, opt)); - sprintf( - kernelName, "eltwise_%s%s%s%s%d", iomName, actName, eltName, formatName.c_str(), inputNum); - sprintf(kernelOpt->sourceName, "eltwise"); + std::string kernel = std::string("eltwise_") + iomName + actName + eltName + formatName + + std::to_string(inputNum); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "eltwise"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(inputNum, "N", opt)); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); @@ -74,9 +75,10 @@ inline EE set_eltwise_broadcast_opt_mali(bool useNchwFormat, char iomName[128] = ""; CHECK_STATUS(set_io_mems_name_and_define_opts(inputMemType, &outputMemType, 2, 1, iomName, opt)); - sprintf(kernelName, "eltwise_broadcast_%s%s%s%s%s%s", iomName, actName, eltName, - swapInputName.c_str(), formatName.c_str(), axisName.c_str()); - sprintf(kernelOpt->sourceName, "eltwise_broadcast"); + std::string kernel = std::string("eltwise_broadcast_") + iomName + actName + eltName + + swapInputName + formatName + axisName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "eltwise_broadcast"); 
CHECK_STATUS(set_activation_define_opt(activeMode, opt)); CHECK_STATUS(set_eltwise_define_opt(eltwiseMode, opt)); if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h index b674305c..ab0b8597 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/expand_opt.h @@ -11,9 +11,10 @@ inline EE set_expand_opt_mali(U32 nDims, char *opt = kernelOpt->option; char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); + std::string kernel = std::string("expand_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "expand"); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "expand_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "expand"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h index b7f25cd6..4f89255e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/fill_memory_zero_vec4_opt.h @@ -10,8 +10,10 @@ inline EE set_fill_memory_zero_vec4_opt_mali( kernelOpt->kernelDataType = dt; char dtName[128]; CHECK_STATUS(set_data_type_name(dt, dtName)); - sprintf(kernelName, "fill_memory_zero_vec4_%s%s", ioMemName, dtName); - sprintf(kernelOpt->sourceName, "fill_memory_zero_vec4"); + std::string buffer = + std::string("fill_memory_zero_vec4_") + std::string(ioMemName) + std::string(dtName); + UNI_STRCPY(kernelName, buffer.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "fill_memory_zero_vec4"); CHECK_STATUS(set_data_type_define_opt(dt, opt)); CHECK_STATUS(set_io_mem_define_opt(GCL_MEM_BUF, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h index 8aeec7fa..409d2755 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/gemm_tn_opt.h @@ -113,9 +113,11 @@ inline EE set_gemm_tn_opt_mali(U32 item_m, matCMemName = "cm_"; CHECK_STATUS(set_chars_define_opt("USE_OUTPUT_IMG", opt)); } - sprintf(kernelName, "gemm_tn%s_%s%s%s%s%s%s%d%d", devName, matAMemName.c_str(), matBMemName.c_str(), - matCMemName.c_str(), modeName, formatName.c_str(), biasName.c_str(), item_m, item_n); - sprintf(kernelOpt->sourceName, "gemm_tn%s", devName); + std::string source = std::string("gemm_tn") + devName; + std::string kernel = source + std::string("_") + matAMemName + matBMemName + matCMemName + + modeName + formatName + biasName + std::to_string(item_m) + std::to_string(item_n); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; U32 UN = item_n - 1; CHECK_STATUS(set_value_define_opt(item_m, "LM", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h index ffac49c2..6f479ac8 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/gemv_opt.h @@ -15,8 +15,9 @@ inline EE set_gemv_trans_mat_opt(U32 workMatChannelsPerThread, 
CHECK_STATUS(set_chars_define_opt("USE_TRANS_CK", opt)); transName = "kc_"; } - sprintf(kernelName, "gemv_trans_mat_%s%d", transName.c_str(), C); - sprintf(kernelOpt->sourceName, "gemv_trans_mat"); + std::string kernel = std::string("gemv_trans_mat_") + transName + std::to_string(C); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "gemv_trans_mat"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(C, "C", opt)); return SUCCESS; @@ -64,9 +65,11 @@ inline EE set_gemv_opt(U32 workMatChannelsPerThread, reduceName = "_reduce"; } - sprintf(kernelName, "gemv%s_%s%s%s%d", reduceName.c_str(), modeName, outFormatName.c_str(), - biasName.c_str(), OC); - sprintf(kernelOpt->sourceName, "gemv%s", reduceName.c_str()); + std::string source = "gemv" + reduceName; + std::string kernel = + source + std::string("_") + modeName + outFormatName + biasName + std::to_string(OC); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, source.c_str()); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_value_define_opt(OC, "OC", opt)); CHECK_STATUS(set_activation_define_opt(activeMode, opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h index fb7b83f0..40559f4d 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/mem_trans_opt.h @@ -46,14 +46,15 @@ inline EE set_mem_trans_opt_mali(MemTransFormType type, default: CHECK_STATUS(NOT_MATCH); } - sprintf(kernelName, "mem_trans_%s%s%s%s", use3dFormat.c_str(), ioMemName, inputFormat.c_str(), - outputFormat.c_str()); - kernelOpt->kernelDataType = dt; + std::string kernel = + std::string("mem_trans_") + use3dFormat + ioMemName + inputFormat + outputFormat; + UNI_STRCPY(kernelName, kernel.c_str()); if (use3dMode) { - sprintf(kernelOpt->sourceName, "mem_trans_3d"); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans_3d"); } else { - sprintf(kernelOpt->sourceName, "mem_trans"); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans"); } + kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } @@ -83,8 +84,9 @@ inline EE set_mem_trans_c_opt_mali(MemTransCType type, } else { CHECK_STATUS(NOT_MATCH); } - sprintf(kernelName, "mem_trans_c_%s%s", ioMemName, transFormat.c_str()); - sprintf(kernelOpt->sourceName, "mem_trans_c"); + std::string kernel = std::string("mem_trans_c_") + ioMemName + transFormat; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "mem_trans_c"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h index 6eeec422..d93e8c8d 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/normalization_opt.h @@ -5,13 +5,12 @@ inline EE set_normalization_opt_mali( bool useNchwFormat, DataType dt, char *kernelName, KernelOpt *kernelOpt) { - std::string formatName = ""; + std::string kernel = "normalization"; if (useNchwFormat) { - formatName = "_nchw"; + kernel += "_nchw"; } - - sprintf(kernelName, "normalization%s", formatName.c_str()); - sprintf(kernelOpt->sourceName, "normalization"); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "normalization"); 
kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h index 89e544d4..324505e7 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/padding_opt.h @@ -11,36 +11,36 @@ inline EE set_padding_opt_mali(bool useNchwFormat, { kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; - std::string formatName = ""; + std::string name = "padding_"; if (useNchwFormat) { - formatName = "nchw_"; + name += "nchw_"; } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - std::string modeName = ""; + std::string modeName; switch (mode) { - case Pad_Constant: + case PAD_CONSTANT: modeName = "constant"; CHECK_STATUS(set_chars_define_opt("USE_CONSTANT", opt)); break; - case Pad_Edge: + case PAD_EDGE: modeName = "edge"; CHECK_STATUS(set_chars_define_opt("USE_EDGE", opt)); break; - case Pad_Reflect: + case PAD_REFLECT: modeName = "reflect"; CHECK_STATUS(set_chars_define_opt("USE_REFLECT", opt)); break; - case Pad_Symmetric: + case PAD_SYMMETRIC: modeName = "symmetric"; CHECK_STATUS(set_chars_define_opt("USE_SYMMETRIC", opt)); break; default: return NOT_SUPPORTED; } - - sprintf(kernelName, "padding_%s%s", formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "padding"); + name += modeName; + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "padding"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h index 6333dc34..f85c79b4 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/pooling_opt.h @@ -22,8 +22,9 @@ inline EE set_pooling_opt_mali(PoolingMode mode, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "pooling_%s%s", ioMemName, modeName.c_str()); - sprintf(kernelOpt->sourceName, "pooling"); + std::string kernel = std::string("pooling_") + ioMemName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "pooling"); kernelOpt->kernelDataType = dt; CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h index 97c590c4..173da092 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/power_opt.h @@ -20,8 +20,9 @@ inline EE set_power_opt_mali(bool useNchwFormat, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "power_%s%s%s", ioMemName, formatName.c_str(), dtName.c_str()); - sprintf(kernelOpt->sourceName, "power"); + std::string kernel = std::string("power_") + ioMemName + formatName + dtName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "power"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; if (useNchwFormat) { diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h index 73730241..4c412457 100644 --- 
a/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/prelu_opt.h @@ -36,9 +36,9 @@ inline EE set_prelu_opt_mali(bool propagate_down, } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "prelu_%s%s%s%s", ioMemName, formatName.c_str(), reluAxisName.c_str(), - progName.c_str()); - sprintf(kernelOpt->sourceName, "prelu"); + std::string kernel = std::string("prelu_") + ioMemName + formatName + reluAxisName + progName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "prelu"); kernelOpt->kernelDataType = dt; if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h index 8418c515..d31e91c6 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/reduction_opt.h @@ -51,12 +51,13 @@ inline EE set_reduction_opt_mali(bool useNchwFormat, return NOT_SUPPORTED; } - sprintf(kernelName, "reduction_%s%s%s%d", formatName.c_str(), outputC4Name.c_str(), - modeName.c_str(), axis); + std::string kernel = + std::string("reduction_") + formatName + outputC4Name + modeName + std::to_string(axis); + UNI_STRCPY(kernelName, kernel.c_str()); if (useNchwFormat) { - sprintf(kernelOpt->sourceName, "reduction_nchw"); + UNI_STRCPY(kernelOpt->sourceName, "reduction_nchw"); } else { - sprintf(kernelOpt->sourceName, "reduction"); + UNI_STRCPY(kernelOpt->sourceName, "reduction"); } CHECK_STATUS(set_value_define_opt(axis, "AXIS", opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h index 3408dc6a..dfd90ee7 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/rnncell_update_res_opt.h @@ -23,8 +23,9 @@ inline EE set_rnncell_update_res_opt_mali(bool useProjection, CHECK_STATUS(set_chars_define_opt("USE_RNN_MODE", opt)); } kernelOpt->kernelDataType = dt; - sprintf(kernelName, "rnncell_update_res_%s%s", proName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "rnncell_update_res"); + std::string kernel = std::string("rnncell_update_res_") + proName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "rnncell_update_res"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h index b2a0db6f..254bf29e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/roialign_opt.h @@ -27,8 +27,9 @@ inline EE set_roialign_opt_mali(bool useNchwFormat, } else { CHECK_STATUS(NOT_SUPPORTED); } - sprintf(kernelName, "roialign_%s%s%s", ioMemName, formatName.c_str(), modeName.c_str()); - sprintf(kernelOpt->sourceName, "roialign"); + std::string kernel = std::string("roialign_") + ioMemName + formatName + modeName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "roialign"); if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h 
b/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h index 40d9466f..a78d264f 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/scale_opt.h @@ -48,9 +48,10 @@ inline EE set_scale_opt_mali(bool useAlpha, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); - sprintf(kernelName, "scale_%s%s%s%s%s%s", ioMemName, formatName.c_str(), broadName.c_str(), - axisName.c_str(), alphaName.c_str(), betaName.c_str()); - sprintf(kernelOpt->sourceName, "scale"); + std::string kernel = std::string("scale_") + ioMemName + formatName + broadName + axisName + + alphaName + betaName; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "scale"); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h index 6a2bf5ca..6dc3a01e 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/slice_opt.h @@ -4,13 +4,13 @@ inline EE set_slice_opt_mali( bool useNchwFormat, U32 axis, U32 slice_num, DataType dt, char *kernelName, KernelOpt *kernelOpt) { - std::string formatName = ""; + std::string name = "slice_"; if (useNchwFormat) { - formatName = "nchw_"; + name += "nchw_"; } - - sprintf(kernelName, "slice_%s%d%d", formatName.c_str(), axis, slice_num); - sprintf(kernelOpt->sourceName, "slice"); + name += std::to_string(axis) + std::to_string(slice_num); + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "slice"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_value_define_opt(axis, "AXIS_NUM", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h index 078a8018..62138184 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/softmax_opt.h @@ -15,8 +15,9 @@ inline EE set_softmax_opt_mali(U32 axis, if (useNchwFormat) { formatName = "nchw_"; } - sprintf(kernelName, "softmax_%s%s%d", ioMemName, formatName.c_str(), axis); - sprintf(kernelOpt->sourceName, "softmax"); + std::string kernel = std::string("softmax_") + ioMemName + formatName + std::to_string(axis); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "softmax"); kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; CHECK_STATUS(set_value_define_opt(axis, "AXIS", opt)); @@ -83,9 +84,10 @@ inline EE set_softmax_vec_reduce_opt_mali(bool useNchwFormat, } } } - sprintf(kernelName, "softmax_vec_reduce_%s%s%s%s", ioMemName, formatName.c_str(), - inputAxis.c_str(), outputAxis.c_str()); - sprintf(kernelOpt->sourceName, "softmax_vec_reduce"); + std::string kernel = + std::string("softmax_vec_reduce_") + ioMemName + formatName + inputAxis + outputAxis; + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "softmax_vec_reduce"); kernelOpt->kernelDataType = dt; if (useNchwFormat) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h index e82e9df8..a7f38b7b 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/space2depth_opt.h @@ -10,15 
+10,15 @@ inline EE set_space2depth_opt(bool useFormatNchw, KernelOpt *kernelOpt) { char *opt = kernelOpt->option; - std::string formatName = ""; + std::string name = "space2depth"; if (useFormatNchw) { - formatName = "_nchw"; + name += "_nchw"; } char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = DT_F16; - sprintf(kernelName, "space2depth%s", formatName.c_str()); - sprintf(kernelOpt->sourceName, "space2depth"); + UNI_STRCPY(kernelName, name.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "space2depth"); if (useFormatNchw) { CHECK_STATUS(set_chars_define_opt("USE_NCHW", opt)); } diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h index 2952260f..01c21ecd 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/tile_opt.h @@ -12,8 +12,9 @@ inline EE set_tile_opt_mali(U32 nDims, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "tile_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "tile"); + std::string kernel = std::string("tile_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "tile"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h b/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h index 2f3d1745..5c63e340 100644 --- a/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h +++ b/compute/tensor/src/gpu/mali/cl/kernel_option/transpose_opt.h @@ -12,8 +12,9 @@ inline EE set_transpose_opt_mali(U32 nDims, char ioMemName[128] = ""; CHECK_STATUS(set_io_mem_name(inputMemType, outputMemType, ioMemName)); kernelOpt->kernelDataType = dt; - sprintf(kernelName, "transpose_nchw_%s%d", ioMemName, nDims); - sprintf(kernelOpt->sourceName, "transpose_nchw"); + std::string kernel = std::string("transpose_nchw_") + ioMemName + std::to_string(nDims); + UNI_STRCPY(kernelName, kernel.c_str()); + UNI_STRCPY(kernelOpt->sourceName, "transpose_nchw"); CHECK_STATUS(set_value_define_opt(nDims, "DN", opt)); CHECK_STATUS(set_io_mem_define_opt(inputMemType, outputMemType, opt)); return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/cl/pooling.cl b/compute/tensor/src/gpu/mali/cl/pooling.cl index 158fd3b6..b6207663 100644 --- a/compute/tensor/src/gpu/mali/cl/pooling.cl +++ b/compute/tensor/src/gpu/mali/cl/pooling.cl @@ -72,6 +72,7 @@ __kernel void MANGLE_NAME(pooling_, IOM, PM)(const int iw_str, const int ph, const int kw, const int kh, + const int count_include_pad, READ_ONLY_KERNEL_MEM in, KERNEL_MEM out) { @@ -116,7 +117,7 @@ __kernel void MANGLE_NAME(pooling_, IOM, PM)(const int iw_str, ADD_IN_OFF } #if defined(USE_POOLING_MEAN) - float psize = (eh - bh) * (ew - bw); + float psize = count_include_pad ? 
(kh * kw) : ((eh - bh) * (ew - bw)); res = res / psize; #endif STORE_OUT; diff --git a/compute/tensor/src/gpu/mali/convolution.cpp b/compute/tensor/src/gpu/mali/convolution.cpp index 47e31470..7111f251 100644 --- a/compute/tensor/src/gpu/mali/convolution.cpp +++ b/compute/tensor/src/gpu/mali/convolution.cpp @@ -68,7 +68,7 @@ inline void convolution_produce_algos_paras(TensorDesc inputDesc, } algoNumIndex->push_back(vecH->size()); - if (fw == 3 && fh == 3 && sw == 1 && sh == 1 && dw == 1 && dh == 1 + if (fw == 3 && fh == 3 && ft == 1 && sw == 1 && sh == 1 && dw == 1 && dh == 1 && idf != DF_NCHW && odf != DF_NCHW && ic > 32 && fn >= 128 && ih > 64 && iw > 64) { convolutionAlgorithms->push_back(CONVOLUTION_ALGORITHM_WINOGRAD); @@ -76,6 +76,13 @@ inline void convolution_produce_algos_paras(TensorDesc inputDesc, get_gemm_tn_cal_scheme(vecH, vecC, vecK, mt, mt, GCL_MEM_BUF); algoNumIndex->push_back(vecH->size()); } + + if (sw == 1 && sh == 1 && dw == 1 && dh == 1 && fw * fh > 1 && ft == 1 + && idf != DF_NCHW && odf != DF_NCHW && ic > iw * 4 && ic > ih * 4) { + convolutionAlgorithms->push_back(CONVOLUTION_ALGORITHM_INVGEMM); + CHECK_STATUS(get_conv_direct_cal_scheme(vecH, vecC, vecK, 1, 1, fn)); + algoNumIndex->push_back(vecH->size()); + } } inline void infer_align_val(ConvolutionForwardAlgorithm algo, @@ -214,6 +221,14 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt; U32 ic, ih, iw, fn, fh, fw, ft; tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); @@ -230,8 +245,6 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, std::vector vecK; DataFormat idf = inputDesc.df; DataFormat odf = outputDesc.df; - GCLMemType imt = inputMemDesc.memType; - GCLMemType omt = outputMemDesc.memType; convolution_produce_algos_paras(inputDesc, filterDesc, convParamSpec, idf, odf, imt, omt, &convolutionAlgorithms, &algoNumIndex, &vecH, &vecC, &vecK); if (vecH.size() == 1) { @@ -328,13 +341,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, gcl_create_memory(handle, filter); gcl_create_memory(handle, bias); gcl_create_memory(handle, biasbuf); - std::vector tmpDir(3, NULL); + std::vector tmpDir(1, NULL); + std::vector tmpInv(1, NULL); std::vector tmpWino(3, NULL); - std::vector tmp; - tmpbuf->desc.byteSize = maxBytes[0] + 1; + std::vector tmp(3, NULL); + if (maxBytes[0]) { + tmpbuf->desc.byteSize = maxBytes[0]; + } else { + tmpbuf->desc.byteSize = 128; + } gcl_create_memory(handle, tmpbuf); tmpDir[0] = tmpbuf; tmpWino[0] = tmpbuf; + tmpInv[0] = tmpbuf; if (check_qualcomm_device() && maxBytes[1] > 0 && maxBytes[2] > 0 && maxBytes[3] > 0) { tmpImgA->desc.memType = GCL_MEM_IMG_3D; @@ -355,16 +374,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, tmpWino[2] = tmpImgB; } - double minTimeWinograd = DBL_MAX; double minTime = DBL_MAX; + double minTimeWinograd = DBL_MAX; double winogradPicTranTime = DBL_MAX; double winogradOutTranTime = DBL_MAX; + double minTimeInvGemm = DBL_MAX; + double invGemmCol2ImgTime = DBL_MAX; U32 runKernelBe = 0; U32 runKernelEnd = 0; ForwardRunInfoMali bestRunInfo; ForwardRunInfoMali bestRunInfoWinograd; + 
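// --- Illustrative CPU analogue, not part of the patch ---
// This refers to the count_include_pad change to pooling.cl earlier in this patch:
// with count_include_pad set, mean pooling divides by the full window kh*kw,
// otherwise only by the number of in-bounds elements, exactly as psize is chosen there.
#include <algorithm>
#include <vector>

static float mean_pool_at(const std::vector<float> &in, int iw, int ih,
    int ox, int oy, int kw, int kh, int sw, int sh, int pw, int ph,
    bool count_include_pad)
{
    int bw = std::max(ox * sw - pw, 0);             // window clipped to the input
    int bh = std::max(oy * sh - ph, 0);
    int ew = std::min(ox * sw - pw + kw, iw);
    int eh = std::min(oy * sh - ph + kh, ih);
    float res = 0;
    for (int y = bh; y < eh; ++y) {
        for (int x = bw; x < ew; ++x) {
            res += in[y * iw + x];
        }
    }
    float psize = count_include_pad ? float(kh * kw) : float((eh - bh) * (ew - bw));
    return res / psize;
}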
ForwardRunInfoMali bestRunInfoInvGemm; GCLMem_t fltMem = filter; - tmp = tmpDir; + tmp[0] = tmpDir[0]; for (U32 i = 0; i < algosNum; i++) { GCLMem_t biasMem = (runInfos[i].best_k[0] == 0) ? biasbuf : bias; if (check_qualcomm_device()) { @@ -376,14 +398,22 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, break; } } - if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) { + fltMem = filter; + tmp[0] = tmpDir[0]; + } else if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_INVGEMM) { + fltMem = filter; + tmp[0] = tmpInv[0]; + } else if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { if (useWinoFltImg) { gcl_create_memory(handle, filterImg); useWinoFltImg = false; - fltMem = filterImg; } - tmp = tmpWino; - } + fltMem = filterImg; + for (U32 i = 0; i < 3; i++) { + tmp[i] = tmpWino[i]; + } + } } if (convolution_mali(handle, inputDesc, input, filterDesc, fltMem, convParamSpec, &runInfos[i], scaleDesc, NULL, biasDesc, biasMem, maxBytes[0], tmp, outputDesc, @@ -391,11 +421,11 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_DIRECT) { runKernelEnd = handle->kernelVec->size(); gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); - runKernelBe = runKernelEnd; if (minTime > handle->t_execute) { minTime = handle->t_execute; bestRunInfo = runInfos[i]; } + runKernelBe = runKernelEnd; } if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_WINOGRAD) { @@ -416,6 +446,19 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, } runKernelBe = runKernelEnd; } + if (runInfos[i].algorithm == (I32)CONVOLUTION_ALGORITHM_INVGEMM) { + runKernelEnd = handle->kernelVec->size(); + if (invGemmCol2ImgTime == DBL_MAX) { + gcl_run_kernelVec_timing(handle, runKernelEnd - 1, runKernelEnd); + invGemmCol2ImgTime = handle->t_execute; + } + gcl_run_kernelVec_timing(handle, runKernelEnd - 2, runKernelEnd - 1); + if (minTimeInvGemm > handle->t_execute) { + minTimeInvGemm = handle->t_execute; + bestRunInfoInvGemm = runInfos[i]; + } + runKernelBe = runKernelEnd; + } } } @@ -426,10 +469,18 @@ EE convolution_infer_forward_algorithm_mali(GCLHandle_t handle, minTime = minTimeWinograd; bestRunInfo = bestRunInfoWinograd; } + if (minTimeInvGemm != DBL_MAX) { + minTimeInvGemm = minTimeInvGemm + invGemmCol2ImgTime; + } + if (minTimeInvGemm < minTime) { + minTime = minTimeInvGemm; + bestRunInfo = bestRunInfoInvGemm; + } if (minTime == DBL_MAX) { CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/deconvolution.cpp b/compute/tensor/src/gpu/mali/deconvolution.cpp index 4045344f..ae7d23c9 100644 --- a/compute/tensor/src/gpu/mali/deconvolution.cpp +++ b/compute/tensor/src/gpu/mali/deconvolution.cpp @@ -98,6 +98,14 @@ EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Deconvolution, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt; U32 ih, iw, fc, fh, fw; 
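// --- Simplified stand-in, not the real gcl_* cache ---
// Sketch of the caching pattern these hunks add around algorithm selection:
// a key is built from shapes/memory types, tuning is skipped on a hit, and the
// winning configuration is stored after timing. All types below are placeholders.
#include <map>
#include <vector>

struct RunInfo { int algorithm = 0; int best_h = 0, best_c = 0, best_k = 0; };
using Flag = std::vector<int>;   // stands in for build_conv_forward_algorithm_flag()'s result

static std::map<Flag, RunInfo> g_runInfoCache;

static RunInfo select_algorithm(const Flag &flag, RunInfo (*tune)())
{
    auto hit = g_runInfoCache.find(flag);
    if (hit != g_runInfoCache.end()) {
        return hit->second;      // mirrors the gcl_get_runInfo_from_cache() early return
    }
    RunInfo best = tune();       // time DIRECT/WINOGRAD/INVGEMM candidates, keep the fastest
    g_runInfoCache[flag] = best; // mirrors gcl_set_runInfo_to_cache()
    return best;
}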
tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); @@ -210,6 +218,7 @@ EE deconvolution_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/depth2space.cpp b/compute/tensor/src/gpu/mali/depth2space.cpp index efb113b9..97b331e7 100644 --- a/compute/tensor/src/gpu/mali/depth2space.cpp +++ b/compute/tensor/src/gpu/mali/depth2space.cpp @@ -49,14 +49,14 @@ EE depth2space_padding_input_mali(TensorDesc inputDesc, U32 ow, oh, oc, on; tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); on = in; - oc = ic / (p.blockSize * p.blockSize); - oh = ih * p.blockSize; - ow = iw * p.blockSize; - if (ic % (p.blockSize * p.blockSize) != 0) { + oc = ic / (p.block_size * p.block_size); + oh = ih * p.block_size; + ow = iw * p.block_size; + if (ic % (p.block_size * p.block_size) != 0) { return NOT_MATCH; } DataFormat odf = idf; - if ((p.blockSize == 2 && oc < 4) || p.blockSize != 2) { + if ((p.block_size == 2 && oc < 4) || p.block_size != 2) { odf = DF_NCHW; } *outputDesc = tensor4df(idt, odf, on, oc, oh, ow); diff --git a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp index fcfcfef5..f5bbcaff 100644 --- a/compute/tensor/src/gpu/mali/depthwise_convolution.cpp +++ b/compute/tensor/src/gpu/mali/depthwise_convolution.cpp @@ -103,6 +103,14 @@ EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } U32 dw = convParamSpec.dilatedRate_w; U32 dh = convParamSpec.dilatedRate_h; std::vector depthwiseConvAlgorithms; @@ -217,6 +225,7 @@ EE depthwise_convolution_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filter); diff --git a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp index 8f66eab3..1cb308e6 100644 --- a/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp +++ b/compute/tensor/src/gpu/mali/depthwise_pointwise_convolution.cpp @@ -110,12 +110,12 @@ EE depthwise_pointwise_convolution_padding_input_mali(TensorDesc inputDesc, } ih_align *= sh; U32 fhd = (fh - 1) * dh + 1; - U32 pl = convParamSpec.padding_left; - U32 pr = convParamSpec.padding_right; - U32 pt = convParamSpec.padding_top; + U32 pl = convParamSpec.pad_left; + U32 pr = convParamSpec.pad_right; + U32 pt = convParamSpec.pad_top; U32 pb = ih_align + (fhd / 2 * 2) - pt - ih; - if (pb < convParamSpec.padding_bottom) { - pb = convParamSpec.padding_bottom; + if (pb < convParamSpec.pad_bottom) { + pb = convParamSpec.pad_bottom; } inputMem->padding(pl, pr, pt, pb); } @@ -149,6 +149,14 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t hand if (policy == CONVOLUTION_FASTEST) { CHECK_STATUS(NOT_SUPPORTED); } + GCLMemType imt 
= inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector filterDescVec = {dwFilterDesc, pwFilterDesc}; + std::vector flag = build_conv_forward_algorithm_flag( + inputDesc, filterDescVec, OT_Conv, imt, omt, convParamSpec); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector depthwisePointwiseConvAlgorithms; std::vector algoNumIndexD; std::vector vecHD; @@ -372,6 +380,7 @@ EE depthwise_pointwise_convolution_infer_forward_algorithm_mali(GCLHandle_t hand } *forwardRunInfo = bestRunInfo[0]; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo[0]); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(dwFilter); diff --git a/compute/tensor/src/gpu/mali/eltwise.cpp b/compute/tensor/src/gpu/mali/eltwise.cpp index 80c9035b..e81557b1 100644 --- a/compute/tensor/src/gpu/mali/eltwise.cpp +++ b/compute/tensor/src/gpu/mali/eltwise.cpp @@ -65,7 +65,7 @@ inline EE eltwise_checkpara_mali(GCLHandle_t handle, CHECK_STATUS(NULL_POINTER); } } - EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + EltwiseMode eltwiseMode = eltwiseDesc.mode; U32 arrayDimMax = 0; bool sameDesc = eltwise_same_desc(inputDesc, &arrayDimMax); if (sameDesc) { diff --git a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp index 81fc3f13..05dc9d42 100644 --- a/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/activation_mali_fp16.cpp @@ -29,8 +29,6 @@ inline EE activation_core_mali_fp16(GCLHandle_t handle, GCLMem_t output, ActivationMode activationMode) { - UNUSED(inputDesc); - UNUSED(outputDesc); U32 ow, oh, oc, on; U32 iw_str, ih_str, iw_off, ih_off; U32 ow_str, oh_str, ow_off, oh_off; diff --git a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp index 7f87ea2d..5e29c17b 100644 --- a/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/argmax_mali_fp16.cpp @@ -64,10 +64,8 @@ inline EE argmax_core_mali_fp16(GCLHandle_t handle, Mem inv1 = input->mem; Mem ini1 = input->mem; Mem outv1024, outi1024, outv128, outi128; - char kernelName[128]; - char kernelNameIndex[128]; - sprintf(kernelName, "argmax_x"); - sprintf(kernelNameIndex, "argmax_x_index"); + const char *kernelName = "argmax_x"; + const char *kernelNameIndex = "argmax_x_index"; bool use_index = false; U32 offset = 0; U32 len = iw; diff --git a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp index 1564e6f3..7b45135a 100644 --- a/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/bilateral_slice_apply_mali_fp16.cpp @@ -47,7 +47,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + U32 coe = bilateralSliceApplyParamSpec.coefficient; BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; // bool has_offset = bilateralSliceApplyParamSpec.has_offset; U32 dep = gc / coe; @@ -60,7 +60,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, gridbuf = grid->mem; outbuf = output->mem; gridTran = tmpBuf->mem; - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { guidebuf = guide->mem; } else { guidebuf = 
inbuf; @@ -80,11 +80,11 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, gcl_run_kernel_profiling(handle, kernel, dim0, gs0, ls0, "bilateral_slice_apply_pre")); CHECK_STATUS(gcl_print_memory(handle, grid, "bilateral_slice_apply_grid")); #endif - char kernelname[128]; - if (mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv"); + const char *kernelname; + if (mode == BSLICE_APPLY_CONV) { + kernelname = "bilateral_slice_apply_c12_conv"; } else { - sprintf(kernelname, "bilateral_slice_apply_c12"); + kernelname = "bilateral_slice_apply_c12"; } U32 gs[2] = {ow, oh}; U32 ls[2] = {0, 0}; @@ -98,7 +98,7 @@ inline EE bilateral_slice_apply_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { CHECK_STATUS(gcl_print_memory(handle, guide, "bilateral_slice_apply_guide")); } #endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp index 8176c0d9..86658cfd 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_direct_mali_fp16.cpp @@ -17,20 +17,29 @@ #include "gpu/mali/cl/kernel_option/conv_direct_opt.h" #include "gpu/mali/cl/kernel_option/gemv_opt.h" -inline TensorDesc get_nchw_desc_for_img(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec) { +inline TensorDesc get_nchw_desc_for_img(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec) +{ TensorDesc desc = inputDesc; - desc.dims[0] += convParamSpec.padding_left + convParamSpec.padding_right; - desc.dims[1] += convParamSpec.padding_bottom; + desc.dims[0] += convParamSpec.pad_left + convParamSpec.pad_right; + desc.dims[1] += convParamSpec.pad_bottom; return desc; } -inline EE trans_input_nchw_to_img(GCLHandle_t handle, TensorDesc inputDesc, GCLMem_t input, - ConvolutionParamSpec convParamSpec, GCLMem_t tmp, U32 *iw_str, U32 *ih_str, I32 *iw_off, I32 *ih_off) { +inline EE trans_input_nchw_to_img(GCLHandle_t handle, + TensorDesc inputDesc, + GCLMem_t input, + ConvolutionParamSpec convParamSpec, + GCLMem_t tmp, + U32 *iw_str, + U32 *ih_str, + I32 *iw_off, + I32 *ih_off) +{ TensorDesc descNchwImg = get_nchw_desc_for_img(inputDesc, convParamSpec); GCLMem inputTran = *input; - inputTran.desc.dims[0] = descNchwImg.dims[0];//move left padding zero into img + inputTran.desc.dims[0] = descNchwImg.dims[0]; //move left padding zero into img inputTran.desc.dims[1] = descNchwImg.dims[1]; - inputTran.desc.offset[0] -= convParamSpec.padding_left; + inputTran.desc.offset[0] -= convParamSpec.pad_left; if (inputTran.desc.offset[0] < 0) { CHECK_STATUS(NOT_MATCH); } @@ -47,7 +56,7 @@ inline EE trans_input_nchw_to_img(GCLHandle_t handle, TensorDesc inputDesc, GCLM *iw_str = inputImg.desc.stride[0]; *ih_str = inputImg.desc.stride[1]; *iw_off = 0; - *ih_off = -convParamSpec.padding_top; + *ih_off = -convParamSpec.pad_top; return SUCCESS; } @@ -84,9 +93,9 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; st = convParamSpec.stride_t; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; - pt = convParamSpec.padding_before; + ph = convParamSpec.pad_top; + pw = 
convParamSpec.pad_left; + pt = convParamSpec.pad_before; tensorSelectGet(inputDesc, NULL, &df, NULL, &ic, &ih, &iw, &it); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); @@ -104,14 +113,14 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, o_off = oh_off * ow_str + ow_off; if (tmpBuf->desc.memType != GCL_MEM_BUF) { - CHECK_STATUS(trans_input_nchw_to_img(handle, inputDesc, input, convParamSpec, - tmpBuf, &iw_str, &ih_str, &iw_off, &ih_off)); + CHECK_STATUS(trans_input_nchw_to_img( + handle, inputDesc, input, convParamSpec, tmpBuf, &iw_str, &ih_str, &iw_off, &ih_off)); iwh_str = iw_str * ih_str; inbuf = tmpBuf->mem; imt = tmpBuf->desc.memType; } - U32 item_w = forwardRunInfo->best_h[0];//for nchw, reuse on w + U32 item_w = forwardRunInfo->best_h[0]; //for nchw, reuse on w char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; @@ -125,8 +134,7 @@ inline EE direct_core_nchw_to_nchwc4_mali_fp16(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } CHECK_STATUS(set_conv_direct_nchw_to_nchwc4_opt_mali( - fw, fh, ft, sw, item_w, activationMode, - DT_F16, imt, omt, kernelName, &kernelOpt)); + fw, fh, ft, sw, item_w, activationMode, DT_F16, imt, omt, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); if (ot > 1) { CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, iwh_str, ic_str, iw_off, ih_off, ow_str, @@ -177,8 +185,8 @@ inline EE direct_core_fn_spe(GCLHandle_t handle, fh = convParamSpec.kernel_h; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; tensorSelectGet(inputDesc, NULL, NULL, NULL, NULL, &ih, &iw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); fn = oc; @@ -269,9 +277,9 @@ inline EE direct_core_mali_fp16(GCLHandle_t handle, sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; st = convParamSpec.stride_t; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; - pt = convParamSpec.padding_before; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; + pt = convParamSpec.pad_before; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw, &it); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow, &ot); if (on > 1 && ot > 1) { @@ -389,8 +397,8 @@ inline EE direct_dila_core_mali_fp16(GCLHandle_t handle, fh = convParamSpec.kernel_h; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; + pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); @@ -572,7 +580,8 @@ EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, bool useGemvMode = useGemvCalMode(inputDesc, convParamSpec, GCL_MEM_BUF, GCL_MEM_BUF); bool useNchwMode = useNchwCalMode(idf, fw, ic, dw, dh); if (useGemvMode) { - CHECK_STATUS(gemv_infer_forward_tmp_bytes_mali_fp16(inputDesc, outputDesc, bytes, forwardRunInfo)); + CHECK_STATUS( + gemv_infer_forward_tmp_bytes_mali_fp16(inputDesc, outputDesc, bytes, forwardRunInfo)); } else if (useNchwMode) { bool useImg = check_qualcomm_device(); if (useImg) { @@ -589,9 +598,9 @@ EE convolution_direct_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, bytes[3] = depth; } } - } else if (idf == DF_NCHW) {//use tran c1 to c4 - GCLMemDesc desc = convolution_get_input_nchwc4_desc(inputDesc, filterDesc, - 
convParamSpec, outputDesc, useNchwMode, forwardRunInfo); + } else if (idf == DF_NCHW) { //use tran c1 to c4 + GCLMemDesc desc = convolution_get_input_nchwc4_desc( + inputDesc, filterDesc, convParamSpec, outputDesc, useNchwMode, forwardRunInfo); if (desc.memType == GCL_MEM_IMG_3D) { bytes[1] = desc.stride[0]; bytes[2] = desc.stride[1]; diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp new file mode 100644 index 00000000..558f2302 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.cpp @@ -0,0 +1,217 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "gpu/mali/fp16/convolution_mali_fp16.h" +#include "gpu/mali/fp16/convolution_invgemm_mali_fp16.h" +#include "gpu/mali/cl/kernel_option/conv_invgemm_opt.h" +#include "gpu/mali/cl/kernel_option/conv_direct_opt.h" + +inline EE invgemm_core_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + cl_mem inbuf, biasmem, outbuf, fltbuf, tmp; + inbuf = input->mem; + fltbuf = filter->mem; + biasmem = bias->mem; + outbuf = output->mem; + tmp = tmpBuf->mem; + U32 iw, ih, ic, in; + U32 fw, fh, sw, sh, pl, pt; + U32 ow, oh, oc, on; + fw = convParamSpec.kernel_w; + fh = convParamSpec.kernel_h; + sw = convParamSpec.stride_w; + sh = convParamSpec.stride_h; + pl = convParamSpec.pad_left; + pt = convParamSpec.pad_top; + tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); + U32 item_h = forwardRunInfo->best_h[0]; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + item_k = item_k >> 2; + + U32 iw_str, ih_str, ihw_str, ic_str, iw_off, ih_off, in_str; + get_gclmem_dim(input->desc, &iw_str, &ih_str, &ic_str, &iw_off, &ih_off); + U32 i_off = ih_off * iw_str + iw_off; + ihw_str = ih_str * iw_str; + ic_str = (ic + item_c - 1) / item_c; + in_str = ihw_str * ic_str; + U32 ow_str, oh_str, ow_off, oh_off, o_off; + get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + o_off = oh_off * ow_str + ow_off; + + char kernelName[128]; + KernelOpt 
kernelOpt; + U32 gs[3]; + U32 ls[3] = {0, 0, 0}; + U32 dim; + Kernel kernel; + if (sw == 1 && sh == 1) { + U32 tw = iw; + U32 th = ih; + U32 tc = fw * fh * ((oc + 3) / 4 * 4); + U32 tw_str = iw; + U32 th_str = ih; + U32 t_off = 0; + U32 thw_str = tw_str * th_str; + U32 tn_str = thw_str * ((tc + 3) / 4); + gs[0] = tw; + gs[1] = (th + item_h - 1) / item_h; + gs[2] = tc / 4 / item_k * on; + dim = 3; + CHECK_STATUS(set_conv_direct_opt_mali(1, 1, 1, 1, item_h, item_k, true, ACTIVATION_NULL, + DT_F16, input->desc.memType, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ihw_str, ic_str, iw_off, ih_off, tw_str, + thw_str, t_off, th, tc, sw, in_str, tn_str, gs[0], gs[1], inbuf, fltbuf, biasmem, tmp)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + gs[0] = ow; + gs[1] = oh; + gs[2] = (oc + 3) / 4 * on; + I32 pw = fw - 1 - pl; + I32 ph = fh - 1 - pt; + CHECK_STATUS(set_conv_invgemm_col2img_opt( + activationMode, DT_F16, GCL_MEM_BUF, output->desc.memType, kernelName, &kernelOpt)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, ih, fw, fh, pw, ph, ow_str, oh_str, o_off, oc, + gs[0], gs[1], tmp, biasmem, outbuf)); + gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); +#ifdef _DEBUG + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + handle->t_total += handle->t_execute; +#endif + } else { + CHECK_STATUS(NOT_SUPPORTED); + } + return SUCCESS; +} + +inline TensorDesc transform_filter_desc(TensorDesc filterDesc, U32 item_c, U32 item_k) +{ + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, NULL, NULL, &fn, &fc, &fh, &fw); + TensorDesc desc; + desc.df = DF_NCHW; + desc.dt = DT_F16; + desc.nDims = 4; + desc.dims[0] = item_k * item_c; + desc.dims[1] = (fc + item_c - 1) / item_c; + desc.dims[2] = (fn + item_k - 1) / item_k * fw * fh; + desc.dims[3] = 1; + return desc; +} + +EE convolution_invgemm_transform_filter_bytes_mali_fp16( + TensorDesc filterDesc, ForwardRunInfoMali_t forwardRunInfo, TensorDesc *ftmDesc) +{ + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + *ftmDesc = transform_filter_desc(filterDesc, item_c, item_k); + return SUCCESS; +} + +EE convolution_invgemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem) +{ + DataType fdt; + DataFormat fdf; + U32 fw, fh, fc, fn; + tensorSelectGet(filterDesc, &fdt, &fdf, &fn, &fc, &fh, &fw); + U32 fwh = fw * fh; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + char kernelName[128]; + KernelOpt kernelOpt; + Kernel kernel; + U32 gs[3] = {0, 0, 0}; + U32 ls[3] = {0, 0, 0}; + U32 dim = 3; + CHECK_STATUS(set_conv_invgemm_trans_flt_opt(item_k, DT_F16, kernelName, &kernelOpt)); + gs[0] = fwh; + gs[1] = (fc + item_c - 1) / item_c; + gs[2] = (fn + item_k - 1) / item_k * item_k; + CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, fw, fh, fwh, fc, fn, filter->mem, fltmem->mem)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + *fltmemDesc = transform_filter_desc(filterDesc, item_c, item_k); + return SUCCESS; +} + 
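// --- Illustrative CPU analogue, not part of the patch ---
// The INVGEMM path above runs in two stages for stride 1: every filter tap becomes a
// 1x1 convolution writing into an intermediate with fw*fh*oc channels (the GEMM),
// and a col2img kernel then accumulates those taps into the output. The real OpenCL
// kernels use NCHWC4 layouts and different indexing; this plain loop version only
// shows the decomposition, assuming stride 1 and output spatial size equal to input.
#include <vector>

// in : [ic][ih][iw], w : [oc][ic][fh][fw], out : [oc][ih][iw]
static void conv_invgemm_cpu(const std::vector<float> &in, const std::vector<float> &w,
    std::vector<float> &out, int ic, int ih, int iw, int oc, int fh, int fw, int pt, int pl)
{
    // Stage 1: per-tap 1x1 GEMM -> tmp[(ky*fw+kx)*oc + c][y][x]
    std::vector<float> tmp((size_t)fh * fw * oc * ih * iw, 0.f);
    for (int ky = 0; ky < fh; ++ky)
        for (int kx = 0; kx < fw; ++kx)
            for (int c = 0; c < oc; ++c)
                for (int y = 0; y < ih; ++y)
                    for (int x = 0; x < iw; ++x) {
                        float acc = 0;
                        for (int ci = 0; ci < ic; ++ci)
                            acc += w[((c * ic + ci) * fh + ky) * fw + kx] * in[(ci * ih + y) * iw + x];
                        tmp[(((ky * fw + kx) * oc + c) * ih + y) * iw + x] = acc;
                    }
    // Stage 2: col2img-style accumulation of the taps into out[c][oy][ox]
    out.assign((size_t)oc * ih * iw, 0.f);
    for (int c = 0; c < oc; ++c)
        for (int oy = 0; oy < ih; ++oy)
            for (int ox = 0; ox < iw; ++ox)
                for (int ky = 0; ky < fh; ++ky)
                    for (int kx = 0; kx < fw; ++kx) {
                        int y = oy + ky - pt, x = ox + kx - pl;
                        if (y < 0 || y >= ih || x < 0 || x >= iw) {
                            continue;   // padded positions contribute nothing
                        }
                        out[(c * ih + oy) * iw + ox] += tmp[(((ky * fw + kx) * oc + c) * ih + y) * iw + x];
                    }
}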
+EE convolution_invgemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes) +{ + DataType dt = inputDesc.dt; + U32 iw = inputDesc.dims[0]; + U32 ih = inputDesc.dims[1]; + U32 fw = convParamSpec.kernel_w; + U32 fh = convParamSpec.kernel_h; + U32 oc = outputDesc.dims[outputDesc.nDims - 2]; + U32 on = outputDesc.dims[outputDesc.nDims - 1]; + U32 bufSize = 0; + U32 item_c = forwardRunInfo->best_c[0]; + U32 item_k = forwardRunInfo->best_k[0]; + + U32 tw = iw; + U32 th = ih; + U32 tc = fw * fh * ((oc + 3) / 4 * 4); + U32 tn = on; + bufSize = tw * th * tc * tn * bytesOf(dt); + *bytes = bufSize; + return SUCCESS; +} + +EE convolution_invgemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode) +{ + CHECK_STATUS(fill_output_zero(handle, output, outputDesc)); + CHECK_STATUS(invgemm_core_mali_fp16(handle, inputDesc, input, filterDesc, filter, convParamSpec, + forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode)); + return SUCCESS; +} diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h new file mode 100644 index 00000000..49dcf1a0 --- /dev/null +++ b/compute/tensor/src/gpu/mali/fp16/convolution_invgemm_mali_fp16.h @@ -0,0 +1,50 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
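// --- Worked example, not part of the patch ---
// Size of the INVGEMM temporary buffer computed by
// convolution_invgemm_infer_forward_tmp_bytes_mali_fp16 above: iw * ih * tc * on *
// bytesOf(dt), with tc = fw * fh * ((oc + 3) / 4 * 4). FP16 means 2 bytes per element;
// the shapes below are hypothetical and chosen only for illustration.
#include <cstdio>

int main()
{
    unsigned iw = 7, ih = 7, fw = 3, fh = 3, oc = 256, on = 1;  // hypothetical shapes
    unsigned tc = fw * fh * ((oc + 3) / 4 * 4);   // 9 * 256 = 2304 intermediate channels
    unsigned bytes = iw * ih * tc * on * 2;       // 7 * 7 * 2304 * 2 = 225,792 bytes
    std::printf("invgemm tmp buffer: %u bytes\n", bytes);
    return 0;
}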
+ +#ifndef _H_CONVOLUTION_INVGEMM_MALI_FP16 +#define _H_CONVOLUTION_INVGEMM_MALI_FP16 + +#include "gpu/mali/fp16/tensor_computing_fp16.h" + +EE convolution_invgemm_transform_filter_bytes_mali_fp16( + TensorDesc filterDesc, ForwardRunInfoMali_t forwardRunInfo, TensorDesc *ftmDesc); + +EE convolution_invgemm_transform_filter_mali_fp16(GCLHandle_t handle, + TensorDesc filterDesc, + GCLMem_t filter, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc *fltmemDesc, + GCLMem_t fltmem); + +EE convolution_invgemm_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, + TensorDesc filterDesc, + TensorDesc outputDesc, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + U32 *bytes); + +EE convolution_invgemm_mali_fp16(GCLHandle_t handle, + TensorDesc inputDesc, + const GCLMem_t input, + TensorDesc filterDesc, + const GCLMem_t filter, + ConvolutionParamSpec convParamSpec, + ForwardRunInfoMali_t forwardRunInfo, + TensorDesc biasDesc, + const GCLMem_t bias, + U32 tmpBytes, + GCLMem_t tmpBuf, + TensorDesc outputDesc, + GCLMem_t output, + ActivationMode activationMode); +#endif diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp index c4508530..ee1fbcf5 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.cpp @@ -14,6 +14,7 @@ #include "gpu/mali/fp16/convolution_mali_fp16.h" #include "gpu/mali/fp16/convolution_direct_mali_fp16.h" #include "gpu/mali/fp16/convolution_wino_mali_fp16.h" +#include "gpu/mali/fp16/convolution_invgemm_mali_fp16.h" inline EE convolution_checkpara_mali_fp16(GCLHandle_t handle, TensorDesc inputDesc, @@ -61,6 +62,10 @@ EE convolution_transform_filter_bytes_mali_fp16( ret = convolution_wino_transform_filter_bytes_mali_fp16( filterDesc, forwardRunInfo, ftmDesc); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_transform_filter_bytes_mali_fp16( + filterDesc, forwardRunInfo, ftmDesc); + break; default: ret = NOT_SUPPORTED; break; @@ -90,6 +95,10 @@ EE convolution_transform_filter_mali_fp16(GCLHandle_t handle, ret = convolution_wino_transform_filter_mali_fp16( handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem, tmp); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_transform_filter_mali_fp16( + handle, filterDesc, filter, forwardRunInfo, fltmemDesc, fltmem); + break; default: ret = NOT_SUPPORTED; break; @@ -118,6 +127,10 @@ EE convolution_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, ret = convolution_wino_infer_forward_tmp_bytes_mali_fp16( inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_infer_forward_tmp_bytes_mali_fp16( + inputDesc, filterDesc, outputDesc, convParamSpec, forwardRunInfo, bytes); + break; default: ret = NOT_SUPPORTED; break; @@ -158,6 +171,11 @@ EE convolution_mali_fp16(GCLHandle_t handle, convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf, outputDesc, output, activationMode); break; + case CONVOLUTION_ALGORITHM_INVGEMM: + ret = convolution_invgemm_mali_fp16(handle, inputDesc, input, filterDesc, filter, + convParamSpec, forwardRunInfo, biasDesc, bias, tmpBytes, tmpBuf[0], outputDesc, output, + activationMode); + break; default: ret = NOT_SUPPORTED; break; diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h index b664d39e..bd1ae9a9 
100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/convolution_mali_fp16.h @@ -16,11 +16,7 @@ #include "gpu/mali/fp16/tensor_computing_fp16.h" #include "gpu/mali/cl/kernel_option/common_opt.h" -inline bool useNchwCalMode(DataFormat idf, - U32 fw, - U32 ic, - U32 dw, - U32 dh) +inline bool useNchwCalMode(DataFormat idf, U32 fw, U32 ic, U32 dw, U32 dh) { bool useNchwMode = false; bool qualCommDev = check_qualcomm_device(); @@ -54,8 +50,18 @@ inline bool useGemvCalMode( } inline void calPaddingVal(TensorDesc inputDesc, - TensorDesc filterDesc, ConvolutionParamSpec convParamSpec, U32 w_align, U32 h_align, U32 n_align, - bool useNchwMode, U32 *pl, U32 *pr, U32 *pt, U32 *pb, U32 *pa, U32 *pf) + TensorDesc filterDesc, + ConvolutionParamSpec convParamSpec, + U32 w_align, + U32 h_align, + U32 n_align, + bool useNchwMode, + U32 *pl, + U32 *pr, + U32 *pt, + U32 *pb, + U32 *pa, + U32 *pf) { U32 iw, ih, ic, it, in; tensorSelectGet(inputDesc, NULL, NULL, &in, &ic, &ih, &iw, &it); @@ -63,10 +69,10 @@ inline void calPaddingVal(TensorDesc inputDesc, U32 fh = convParamSpec.kernel_h; U32 sh = convParamSpec.stride_h; U32 dh = convParamSpec.dilatedRate_h; - U32 fhd = (fh - 1) * dh + 1; + U32 fhd = (fh - 1) * dh + 1; h_align *= sh; - plv = convParamSpec.padding_left; - ptv = convParamSpec.padding_top; + plv = convParamSpec.pad_left; + ptv = convParamSpec.pad_top; if (useNchwMode) { U32 fw = convParamSpec.kernel_w; U32 sw = convParamSpec.stride_w; @@ -74,18 +80,18 @@ inline void calPaddingVal(TensorDesc inputDesc, U32 fwd = (fw - 1) * dw + 1; w_align *= sw; prv = w_align + (fwd / 2 * 2) - plv - iw; - if (prv < convParamSpec.padding_right) { - prv = convParamSpec.padding_right; + if (prv < convParamSpec.pad_right) { + prv = convParamSpec.pad_right; } pbv = h_align + (fhd / 2 * 2) - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } - } else { - prv = convParamSpec.padding_right; + } else { + prv = convParamSpec.pad_right; pbv = h_align + (fhd / 2 * 2) - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } ic = (ic + 3) / 4; } diff --git a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp index b589879d..f92a3b15 100644 --- a/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/convolution_wino_mali_fp16.cpp @@ -16,14 +16,15 @@ #include "gpu/mali/cl/kernel_option/conv_wino_opt.h" #include "gpu/mali/cl/kernel_option/gemm_tn_opt.h" -TensorDesc getInputPreProcessDesc(TensorDesc inputDesc, ConvolutionParamSpec convParamSpec, U32 wino_w, U32 wino_h) +TensorDesc getInputPreProcessDesc( + TensorDesc inputDesc, ConvolutionParamSpec convParamSpec, U32 wino_w, U32 wino_h) { U32 fw = convParamSpec.kernel_w; U32 fh = convParamSpec.kernel_h; - U32 pl = convParamSpec.padding_left; - U32 pr = convParamSpec.padding_right; - U32 pt = convParamSpec.padding_top; - U32 pb = convParamSpec.padding_bottom; + U32 pl = convParamSpec.pad_left; + U32 pr = convParamSpec.pad_right; + U32 pt = convParamSpec.pad_top; + U32 pb = convParamSpec.pad_bottom; TensorDesc desc = inputDesc; desc.df = DF_NCHW; desc.dims[0] = wino_w * 4; @@ -35,7 +36,7 @@ TensorDesc getInputPreProcessDesc(TensorDesc inputDesc, ConvolutionParamSpec con return 
desc; } -TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 ic, U32 item_n) +TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 ic, U32 item_n) { TensorDesc desc; desc.df = DF_NCHW; @@ -48,7 +49,7 @@ TensorDesc getPicTranDesc(DataType dt, U32 wino_w, U32 wino_h, U32 wino_num, U32 return desc; } -TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) +TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) { TensorDesc desc; desc.df = DF_NCHW; @@ -61,17 +62,30 @@ TensorDesc getGemmOutDesc(DataType dt, U32 M, U32 N, U32 wino_num) return desc; } -inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, - U32 iw_str, U32 ih_str, U32 i_off, U32 ow_str, U32 oh_str, - U32 iw, U32 ih, U32 ic, U32 pw, U32 ph, - GCLMemType imt, GCLMemType omt, Mem in, Mem out) +inline EE wino_preprocess_input(GCLHandle_t handle, + DataType dt, + DataFormat df, + U32 iw_str, + U32 ih_str, + U32 i_off, + U32 ow_str, + U32 oh_str, + U32 iw, + U32 ih, + U32 ic, + U32 pw, + U32 ph, + GCLMemType imt, + GCLMemType omt, + Mem in, + Mem out) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - bool useNchwFormat = (df == DF_NCHW) ? true :false; - CHECK_STATUS(set_conv_wino_preprocess_input_opt(dt, useNchwFormat, imt, omt, - kernelName, &kernelOpt)); + bool useNchwFormat = (df == DF_NCHW) ? true : false; + CHECK_STATUS( + set_conv_wino_preprocess_input_opt(dt, useNchwFormat, imt, omt, kernelName, &kernelOpt)); U32 gs[3] = {(ow_str + 3) / 4, oh_str, (ic + 3) / 4}; U32 ls[3] = {0}; U32 dim = 3; @@ -80,8 +94,8 @@ inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, gs[2] = ic; } CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, ow_str, oh_str, - iw, ih, ic, pw, ph, gs[0], gs[1], in, out)); + CHECK_STATUS(gcl_set_kernelArgs( + kernel, iw_str, ih_str, i_off, ow_str, oh_str, iw, ih, ic, pw, ph, gs[0], gs[1], in, out)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -90,21 +104,31 @@ inline EE wino_preprocess_input(GCLHandle_t handle, DataType dt, DataFormat df, return SUCCESS; } -inline EE wino_trans_pic_nchw(GCLHandle_t handle, DataType dt, U32 wino_w, U32 wino_h, U32 ic, - U32 iw_str, U32 ih_str, U32 i_off, U32 pw_str, U32 pwh_str, - GCLMemType imt, Mem in, Mem out) +inline EE wino_trans_pic_nchw(GCLHandle_t handle, + DataType dt, + U32 wino_w, + U32 wino_h, + U32 ic, + U32 iw_str, + U32 ih_str, + U32 i_off, + U32 pw_str, + U32 pwh_str, + GCLMemType imt, + Mem in, + Mem out) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - CHECK_STATUS(set_common_opt(dt, imt, GCL_MEM_BUF, "conv_wino_trans_picbuf_nchw", - kernelName, &kernelOpt)); + CHECK_STATUS( + set_common_opt(dt, imt, GCL_MEM_BUF, "conv_wino_trans_picbuf_nchw", kernelName, &kernelOpt)); U32 gs[3] = {wino_w, wino_h, ic}; U32 ls[3] = {0}; U32 dim = 3; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, pw_str, pwh_str, - gs[0], gs[1], in, out)); + CHECK_STATUS( + gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, pw_str, pwh_str, gs[0], gs[1], in, out)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -129,15 +153,25 @@ inline EE 
wino_trans_pic_img(GCLHandle_t handle, TensorDesc picTranDesc, Mem pic return SUCCESS; } -inline EE wino_gemm(GCLHandle_t handle, DataType dt, - U32 M, U32 N, U32 K, U32 item_m, U32 item_n, U32 wino_num, - GCLMemType ma, GCLMemType mb, Mem A, Mem B, Mem C) +inline EE wino_gemm(GCLHandle_t handle, + DataType dt, + U32 M, + U32 N, + U32 K, + U32 item_m, + U32 item_n, + U32 wino_num, + GCLMemType ma, + GCLMemType mb, + Mem A, + Mem B, + Mem C) { char kernelName[128]; KernelOpt kernelOpt; Kernel kernel; - CHECK_STATUS(set_gemm_tn_opt_mali(item_m, item_n, NO_BIAS, false, ACTIVATION_NULL, dt, - ma, mb, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(set_gemm_tn_opt_mali(item_m, item_n, NO_BIAS, false, ACTIVATION_NULL, dt, ma, mb, + GCL_MEM_BUF, kernelName, &kernelOpt)); U32 gs[3] = {N / item_n, M / item_m, wino_num * wino_num}; U32 ls[3] = {0}; U32 dim = 3; @@ -149,8 +183,8 @@ inline EE wino_gemm(GCLHandle_t handle, DataType dt, U32 ch = M; U32 cc = wino_num * wino_num; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, A_str, B_str, C_str, 0, 0, 0, - cw_str, cw, ch, cc, gs[0], gs[1], A, B, C, C)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, M, N, K, A_str, B_str, C_str, 0, 0, 0, cw_str, cw, ch, + cc, gs[0], gs[1], A, B, C, C)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -184,8 +218,8 @@ inline EE wino_trans_out(GCLHandle_t handle, if ((oh & 3) == 0 && (ow & 3) == 0) { useAlign = true; } - CHECK_STATUS(set_conv_wino_trans_outbuf_opt(useAlign, activationMode, DT_F16, GCL_MEM_BUF, - omt, kernelName, &kernelOpt)); + CHECK_STATUS(set_conv_wino_trans_outbuf_opt( + useAlign, activationMode, DT_F16, GCL_MEM_BUF, omt, kernelName, &kernelOpt)); U32 gs[3] = {wino_w, wino_h, (oc + 3) / 4}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; @@ -260,8 +294,8 @@ EE convolution_wino_transform_filter_mali_fp16(GCLHandle_t handle, U32 offset = ALIGN(fn_align * fwhc * bytesOf(fdt), BUFFER_ALIGN_BASE); CHECK_STATUS(gcl_create_sub_buffer(bytes, &offset, tmp, &fltTranMem)); } - CHECK_STATUS(set_common_opt(DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "conv_wino_trans_fltbuf_3x3", - kernelName, &kernelOpt)); + CHECK_STATUS(set_common_opt( + DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "conv_wino_trans_fltbuf_3x3", kernelName, &kernelOpt)); CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, fn_align, fc, fnc, tmp->mem, fltTranMem)); gs[0] = fn_align; @@ -304,7 +338,7 @@ EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, if (useImg) { U32 width = (inputNchwDesc.dims[0] + 3) / 4; U32 height = inputNchwDesc.dims[1]; - U32 depth = inputNchwDesc.dims[2] * inputNchwDesc.dims[3]; + U32 depth = inputNchwDesc.dims[2] * inputNchwDesc.dims[3]; if (CHECK_MEET_IMAGE_LIMITS(width, height, depth)) { bytes[1] = width; bytes[2] = height; @@ -316,7 +350,7 @@ EE convolution_wino_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, if (!useImg) { bufSize += ALIGN(tensorNumBytes(inputNchwDesc), BUFFER_ALIGN_BASE); } - } else {//for input is NCHW and memType is image + } else { //for input is NCHW and memType is image bufSize += ALIGN(tensorNumBytes(inputNchwDesc), BUFFER_ALIGN_BASE); } @@ -383,17 +417,17 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; fw = convParamSpec.kernel_w; fh = convParamSpec.kernel_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; 
+ pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; tensorSelectGet(inputDesc, &idt, NULL, NULL, &ic, &ih, &iw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); fc = ic; fn = oc; Mem inMem = input->mem; - U32 iw_str ,ih_str; + U32 iw_str, ih_str; I32 iw_off, ih_off, i_off; - get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, (U32*)&iw_off, (U32*)&ih_off); - U32 ow_str ,oh_str, ow_off, oh_off; + get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, (U32 *)&iw_off, (U32 *)&ih_off); + U32 ow_str, oh_str, ow_off, oh_off; get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); GCLMemType imt = input->desc.memType; @@ -408,7 +442,7 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, Mem inputPre; GCLMemType omt; bool useImg = (tmp[1]) ? true : false; - if (inputDesc.df == DF_NCHW) {//for padding input(must be image), have to set data to buffer + if (inputDesc.df == DF_NCHW) { //for padding input(must be image), have to set data to buffer useImg = false; } if (useImg) { @@ -422,9 +456,8 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, U32 tw_str = desc.dims[0]; U32 th_str = desc.dims[1]; i_off = ih_off * iw_str + iw_off; - CHECK_STATUS(wino_preprocess_input(handle, desc.dt, input->desc.df, - iw_str, ih_str, i_off, tw_str, th_str, - iw, ih, ic, pw, ph, imt, omt, inMem, inputPre)); + CHECK_STATUS(wino_preprocess_input(handle, desc.dt, input->desc.df, iw_str, ih_str, i_off, + tw_str, th_str, iw, ih, ic, pw, ph, imt, omt, inMem, inputPre)); inMem = inputPre; iw_str = tw_str; ih_str = th_str; @@ -441,8 +474,8 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(picTranSize, &offset, tmp[0], &picTran)); U32 pw_str = picTranDesc.dims[0]; U32 pwh_str = pw_str * picTranDesc.dims[1]; - CHECK_STATUS(wino_trans_pic_nchw(handle, picTranDesc.dt, wino_w, wino_h, ic, - iw_str, ih_str, i_off, pw_str, pwh_str, imt, inMem, picTran)); + CHECK_STATUS(wino_trans_pic_nchw(handle, picTranDesc.dt, wino_w, wino_h, ic, iw_str, ih_str, + i_off, pw_str, pwh_str, imt, inMem, picTran)); if (tmp[2]) { CHECK_STATUS(wino_trans_pic_img(handle, picTranDesc, picTran, tmp[2]->mem)); picTran = tmp[2]->mem; @@ -459,12 +492,12 @@ EE convolution_wino_mali_fp16(GCLHandle_t handle, GCLMemType fltTranType = filter->desc.memType; CHECK_STATUS(gcl_create_sub_buffer(gemmOutSize, &offset, tmp[0], &gemmOut)); - CHECK_STATUS(wino_gemm(handle, idt, M, N, K, item_m, item_n, wino_num, - fltTranType, picTranType, fltTran, picTran, gemmOut)); + CHECK_STATUS(wino_gemm(handle, idt, M, N, K, item_m, item_n, wino_num, fltTranType, picTranType, + fltTran, picTran, gemmOut)); Mem biasbuf = bias->mem; Mem outbuf = output->mem; - CHECK_STATUS(wino_trans_out(handle, wino_w, wino_h, N, N * M, ow_str, oh_str, ow_off, oh_off, ow, - oh, oc, output->desc.memType, activationMode, biasbuf, gemmOut, outbuf)); + CHECK_STATUS(wino_trans_out(handle, wino_w, wino_h, N, N * M, ow_str, oh_str, ow_off, oh_off, + ow, oh, oc, output->desc.memType, activationMode, biasbuf, gemmOut, outbuf)); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp index d477b7e2..e6a6ed68 100644 --- a/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/deconvolution_gemm_mali_fp16.cpp @@ -42,8 +42,8 @@ inline EE deconv_gemm_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = 
convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; fw = convParamSpec.kernel_w; fh = convParamSpec.kernel_h; tensorSelectGet(inputDesc, NULL, NULL, NULL, &ic, &ih, &iw); diff --git a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp index 8cb35531..b52b4af6 100644 --- a/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depth2space_mali_fp16.cpp @@ -56,7 +56,7 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, char kernelName[128]; KernelOpt kernelOpt; - if (imf == DF_NCHWC4 && p.blockSize == 2) { + if (imf == DF_NCHWC4 && p.block_size == 2) { U32 gs[3] = {iw, ih, (ic_str + 3) / 4}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; @@ -64,8 +64,8 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(set_depth2space_nchwc4_2x2_opt( useOutputNchw, DT_F16, input->desc.memType, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, ihw_str, ic_str, i_off, ow_str, - oh_str, ohw_str, o_off, iw, ih, oc, inbuf, outbuf)); + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.block_size, iw_str, ihw_str, ic_str, i_off, + ow_str, oh_str, ohw_str, o_off, iw, ih, oc, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); @@ -85,13 +85,13 @@ inline EE depth2space_core_mali_fp16(GCLHandle_t handle, inbuf = tmp; } U32 gs[3] = { - iw, ih, (ic / (p.blockSize * p.blockSize) + 3) / 4 * (p.blockSize * p.blockSize)}; + iw, ih, (ic / (p.block_size * p.block_size) + 3) / 4 * (p.block_size * p.block_size)}; U32 ls[3] = {0, 0, 0}; U32 dim = 3; CHECK_STATUS(set_common_opt( DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, "depth2space_nchw", kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); - CHECK_STATUS(gcl_set_kernelArgs(kernel, p.blockSize, iw_str, ihw_str, ow_str, ohw_str, + CHECK_STATUS(gcl_set_kernelArgs(kernel, p.block_size, iw_str, ihw_str, ow_str, ohw_str, i_off, o_off, iw, ih, ic, inbuf, outbuf)); gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName); #ifdef _DEBUG @@ -110,7 +110,7 @@ EE depth2space_infer_tmpBuf_size_mali_fp16( U32 iw, ih, ic, in; tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); *bytes = 0; - if (p.blockSize != 2) { + if (p.block_size != 2) { *bytes = in * ic * ih * iw * bytesOf(idt); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp index 0ff1d6ec..7cf8fcc6 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_direct_mali_fp16.cpp @@ -45,8 +45,8 @@ inline EE depthwise_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - pw = convParamSpec.padding_left; - ph = convParamSpec.padding_top; + pw = convParamSpec.pad_left; + ph = convParamSpec.pad_top; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h index fd018781..ad7c3f9c 100644 --- 
a/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_convolution_mali_fp16.h @@ -29,12 +29,12 @@ inline void calDepthwisePaddingVal(TensorDesc inputDesc, U32 dh = convParamSpec.dilatedRate_h; U32 fhd = (fh - 1) * dh + 1; U32 ih = inputDesc.dims[1]; - U32 plv = convParamSpec.padding_left; - U32 prv = convParamSpec.padding_right; - U32 ptv = convParamSpec.padding_top; + U32 plv = convParamSpec.pad_left; + U32 prv = convParamSpec.pad_right; + U32 ptv = convParamSpec.pad_top; U32 pbv = edge_align * sh + (fhd / 2) * 2 - ptv - ih; - if (pbv < convParamSpec.padding_bottom) { - pbv = convParamSpec.padding_bottom; + if (pbv < convParamSpec.pad_bottom) { + pbv = convParamSpec.pad_bottom; } *pl = plv; *pr = prv; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp index d3334951..1d8c639a 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_direct_mali_fp16.cpp @@ -48,8 +48,8 @@ inline EE depthwise_pointwise_direct_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp index 5e350d79..64de44f2 100644 --- a/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/depthwise_pointwise_convolution_gemm_mali_fp16.cpp @@ -49,8 +49,8 @@ inline EE depthwise_pointwise_gemm_core_mali_fp16(GCLHandle_t handle, U32 ow, oh, oc, on; sw = convParamSpec.stride_w; sh = convParamSpec.stride_h; - ph = convParamSpec.padding_top; - pw = convParamSpec.padding_left; + ph = convParamSpec.pad_top; + pw = convParamSpec.pad_left; dw = convParamSpec.dilatedRate_w; dh = convParamSpec.dilatedRate_h; fw = convParamSpec.kernel_w; diff --git a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp index a04ddae0..2bda3f91 100644 --- a/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/eltwise_mali_fp16.cpp @@ -147,7 +147,7 @@ inline EE eltwise_core_mali_fp16(GCLHandle_t handle, KernelOpt kernelOpt; char kernelName[128]; bool useNchwFormat = (inputMem[arrayDimMax]->desc.memFormat == DF_NCHW) ? 
true : false; - EltwiseMode eltwiseMode = eltwiseDesc.elt_mode; + EltwiseMode eltwiseMode = eltwiseDesc.mode; ActivationMode activeMode = eltwiseDesc.activation_type; U32 gs[3] = {iw, ih, (ic + 3) / 4 * in * it}; U32 ls[3] = {0, 0, 0}; diff --git a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp index 3d9100f6..4fcec723 100644 --- a/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/padding_mali_fp16.cpp @@ -27,11 +27,11 @@ inline EE padding_checkpara_mali_fp16(GCLHandle_t handle, if (inputDesc.dt != outputDesc.dt || inputDesc.dt != DT_F16) { return NOT_SUPPORTED; } - if (padParamSpec.pad_mode == Pad_Reflect && + if (padParamSpec.pad_mode == PAD_REFLECT && (padParamSpec.top >= inputDesc.dims[1] || padParamSpec.bottom >= inputDesc.dims[1])) { return NOT_SUPPORTED; } - if (padParamSpec.pad_mode == Pad_Symmetric && + if (padParamSpec.pad_mode == PAD_SYMMETRIC && (padParamSpec.left > inputDesc.dims[0] || padParamSpec.right > inputDesc.dims[0])) { return NOT_SUPPORTED; } diff --git a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp index 48a0fff2..6b47d7c0 100644 --- a/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/pooling_mali_fp16.cpp @@ -33,10 +33,10 @@ inline EE pooling_checkpara_mali_fp16(GCLHandle_t handle, if (inputDesc.dims[2] != outputDesc.dims[2] || inputDesc.dims[3] != outputDesc.dims[3]) { return NOT_SUPPORTED; } - if (poolingParamSpec.padding_top >= poolingParamSpec.kernel_h) { + if (poolingParamSpec.pad_top >= poolingParamSpec.kernel_h) { return NOT_SUPPORTED; } - if (poolingParamSpec.padding_bottom >= poolingParamSpec.kernel_w) { + if (poolingParamSpec.pad_bottom >= poolingParamSpec.kernel_w) { return NOT_SUPPORTED; } if (input->desc.memFormat != output->desc.memFormat || input->desc.memFormat != DF_NCHWC4) { @@ -74,9 +74,9 @@ inline EE pooling_core_mali_fp16(GCLHandle_t handle, sw = poolingParamSpec.stride_w; sh = poolingParamSpec.stride_h; st = poolingParamSpec.stride_t; - pw = poolingParamSpec.padding_left; - ph = poolingParamSpec.padding_top; - pt = poolingParamSpec.padding_before; + pw = poolingParamSpec.pad_left; + ph = poolingParamSpec.pad_top; + pt = poolingParamSpec.pad_before; kw = poolingParamSpec.kernel_w; kh = poolingParamSpec.kernel_h; kt = poolingParamSpec.kernel_t; @@ -134,7 +134,8 @@ inline EE pooling_core_mali_fp16(GCLHandle_t handle, mode, DT_F16, input->desc.memType, output->desc.memType, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, iw_off, ih_off, ow_str, oh_str, - o_off, iw, ih, ow, oh, sw, sh, pw, ph, kw, kh, inbuf, outbuf)); + o_off, iw, ih, ow, oh, sw, sh, pw, ph, kw, kh, (int)poolingParamSpec.count_include_pad, + inbuf, outbuf)); CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); #ifdef _DEBUG CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); diff --git a/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp index 77268dee..a8190758 100644 --- a/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/reduction_mali_fp16.cpp @@ -36,7 +36,7 @@ inline EE reduction_core_mali_fp16(GCLHandle_t handle, { int axisTran[6]; int axis; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < 
p.num_axes; i++) { axis = p.axes[i]; if (axis < 0) { axis = inputDesc.nDims + axis; @@ -97,8 +97,8 @@ inline EE reduction_core_mali_fp16(GCLHandle_t handle, useNchw = true; edge = ow; } - CHECK_STATUS(set_reduction_opt_mali(useNchw, useOc4, axis, p.reduction_mode, DT_F16, - GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); + CHECK_STATUS(set_reduction_opt_mali( + useNchw, useOc4, axis, p.mode, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, ow_str, oh_str, i_off, o_off, iw, ih, ic, edge, keep_dim, od, gs[0], gs[1], inbuf, outbuf)); diff --git a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp index 9779aecc..94a1cecf 100644 --- a/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/rnn_mali_fp16.cpp @@ -232,9 +232,9 @@ inline EE rnn_core_update(GCLHandle_t handle, char *kernelName, KernelOpt *kernelOpt) { - float fbias = rnnPara.forgetBias; - float zonecell = rnnPara.zoneoutCell; - float zoneout = rnnPara.zoneoutOutput; + float fbias = rnnPara.forget_bias; + float zonecell = rnnPara.zoneout_cell; + float zoneout = rnnPara.zoneout_output; U32 gs = (col + 3) / 4; U32 ls = 16; U32 dim = 1; @@ -310,7 +310,7 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, GCLMem_t output, ForwardRunInfoMali_t forwardRunInfo) { - bool project = (rnnPara.numProjection > 0) ? true : false; + bool project = (rnnPara.num_projection > 0) ? true : false; if (project) { CHECK_STATUS(NOT_SUPPORTED); } @@ -337,8 +337,8 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, U32 batch = desc.dims[desc.nDims - 1]; U32 step = desc.dims[desc.nDims - 2]; U32 xDim = desc.dims[desc.nDims - 3]; - U32 hDim = rnnPara.numOutput; - U32 col = (rnnPara.numProjection > 0) ? rnnPara.numProjection : hDim; + U32 hDim = rnnPara.num_outputs; + U32 col = (rnnPara.num_projection > 0) ? rnnPara.num_projection : hDim; for (U32 i = 0; i < desc.nDims - 3; i++) { xDim *= desc.dims[i]; } @@ -421,7 +421,7 @@ inline EE rnn_core_mali_fp16(GCLHandle_t handle, rnn_core_copy_stateH(handle, col, hDim, outputDescs.size(), false, stateH, output)); } - if (rnnPara.biDirection) { + if (rnnPara.bi_direction) { gemmMatB = filter[filterCount].mem; gemmMatBType = filter[filterCount].desc.memType; gemmBias = bias[biasCount].mem; @@ -458,7 +458,7 @@ inline void transform_filter_desc(TensorDesc filterDesc, { U32 filterRow, filterCol; tensorSelectGet(filterDesc, NULL, NULL, NULL, NULL, &filterRow, &filterCol); - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; U32 xDim = filterCol - hDim; TensorDesc desc; @@ -499,7 +499,7 @@ EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, DataType fdt; U32 filterRow, filterCol; tensorSelectGet(filterDesc, &fdt, NULL, NULL, NULL, &filterRow, &filterCol); - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; U32 xDim = filterCol - hDim; char kernelName[128]; KernelOpt kernelOpt; @@ -515,7 +515,7 @@ EE rnn_transform_filter_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(weightGemmSize, &subMemOff, tmpBuf, &weightGemm)); CHECK_STATUS(gcl_create_sub_buffer(weightGemvSize, &subMemOff, tmpBuf, &weightGemv)); - U32 biDirNum = (rnnPara.biDirection) ? 2 : 1; + U32 biDirNum = (rnnPara.bi_direction) ? 
2 : 1; U32 filterCount = 0; U32 filterTranCount = 0; U32 item_n = forwardRunInfo->best_h[0]; @@ -596,8 +596,8 @@ EE rnn_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, size += ALIGN(gemmMatASize, BUFFER_ALIGN_BASE); } - U32 hDim = rnnPara.numOutput; - U32 col = (rnnPara.numProjection > 0) ? rnnPara.numProjection : hDim; + U32 hDim = rnnPara.num_outputs; + U32 col = (rnnPara.num_projection > 0) ? rnnPara.num_projection : hDim; U32 filterRow = col * 4; U32 M = ALIGN(step * batch, item_m); U32 N = ALIGN(filterRow, item_n); diff --git a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp index 87f2a536..83b4dc7e 100644 --- a/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/rnncell_mali_fp16.cpp @@ -44,12 +44,12 @@ inline EE rnncell_core_mali_fp16(GCLHandle_t handle, ForwardRunInfoMali_t forwardRunInfo) { U32 item_c = forwardRunInfo->best_c[0]; - U32 hDim = rnncellDesc.numOutput; - U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; - bool project = (rnncellDesc.numProjection > 0) ? true : false; - float fbias = rnncellDesc.forgetBias; - float zonecell = rnncellDesc.zoneoutCell; - float zoneout = rnncellDesc.zoneoutOutput; + U32 hDim = rnncellDesc.num_outputs; + U32 col = (rnncellDesc.num_projection > 0) ? rnncellDesc.num_projection : hDim; + bool project = (rnncellDesc.num_projection > 0) ? true : false; + float fbias = rnncellDesc.forget_bias; + float zonecell = rnncellDesc.zoneout_cell; + float zoneout = rnncellDesc.zoneout_output; U32 xw_str, xh_str, xh_off, xw_off; U32 hw_str, hh_str, hh_off, hw_off; CHECK_STATUS(gclmem_get_desc_padding(currentX->desc, &xw_str, &xh_str, NULL, &xw_off, &xh_off)); @@ -123,7 +123,7 @@ inline EE rnncell_core_mali_fp16(GCLHandle_t handle, if (project) { item_c = forwardRunInfo->best_c[1]; - filterRow = rnncellDesc.numOutput; + filterRow = rnncellDesc.num_outputs; fltbuf = filter[1].mem; tmpOff = offset; //biasMem = bias[1].mem; @@ -150,13 +150,13 @@ inline void transform_filter_desc(TensorDesc filterDesc, U32 item_c = forwardRunInfo->best_c[0]; U32 item_k = forwardRunInfo->best_k[0]; ftmDesc[0] = gemv_transform_filter_desc(filterDesc, item_h, item_c, item_k); - bool useProject = (rnnParamSpec.numProjection > 0) ? true : false; + bool useProject = (rnnParamSpec.num_projection > 0) ? true : false; if (useProject) { item_h = forwardRunInfo->best_h[1]; item_c = forwardRunInfo->best_c[1]; item_k = forwardRunInfo->best_k[1]; - TensorDesc filterDescPro = - tensor2df(filterDesc.dt, DF_NORMAL, rnnParamSpec.numOutput, rnnParamSpec.numProjection); + TensorDesc filterDescPro = tensor2df( + filterDesc.dt, DF_NORMAL, rnnParamSpec.num_outputs, rnnParamSpec.num_projection); ftmDesc[1] = gemv_transform_filter_desc(filterDescPro, item_h, item_c, item_k); } } @@ -178,15 +178,15 @@ EE rnncell_transform_filter_mali_fp16(GCLHandle_t handle, GCLMem_t fltmem, ForwardRunInfoMali_t forwardRunInfo) { - U32 filterNum = (rnnParamSpec.numProjection > 0) ? 2 : 1; + U32 filterNum = (rnnParamSpec.num_projection > 0) ? 
2 : 1; for (U32 i = 0; i < filterNum; i++) { ForwardRunInfoMali runInfo = *forwardRunInfo; if (i == 1) { runInfo.best_h[i - 1] = runInfo.best_h[i]; runInfo.best_c[i - 1] = runInfo.best_c[i]; runInfo.best_k[i - 1] = runInfo.best_k[i]; - filterDesc.dims[0] = rnnParamSpec.numProjection; - filterDesc.dims[1] = rnnParamSpec.numOutput; + filterDesc.dims[0] = rnnParamSpec.num_projection; + filterDesc.dims[1] = rnnParamSpec.num_outputs; } CHECK_STATUS(gemv_transform_filter_mali_fp16( handle, filterDesc, &filter[i], &fltmemDesc[i], &fltmem[i], &runInfo)); @@ -204,12 +204,12 @@ EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, U32 item_c = forwardRunInfo->best_c[0]; DataType dt = inputDesc.dt; U32 xDim = inputDesc.dims[0]; - U32 hDim = rnncellDesc.numOutput; + U32 hDim = rnncellDesc.num_outputs; U32 c_align = (item_c > 16) ? (item_c >> 4) : item_c; U32 xhNum = ALIGN(xDim + hDim, c_align); U32 xhSize = ALIGN(xhNum * bytesOf(dt), BUFFER_ALIGN_BASE); - U32 col = (rnncellDesc.numProjection > 0) ? rnncellDesc.numProjection : hDim; + U32 col = (rnncellDesc.num_projection > 0) ? rnncellDesc.num_projection : hDim; U32 filterRow = col * 4; U32 interNum = filterRow + 4; U32 interSize = ALIGN(interNum * bytesOf(dt), BUFFER_ALIGN_BASE); @@ -217,12 +217,12 @@ EE rnncell_infer_forward_tmp_bytes_mali_fp16(TensorDesc inputDesc, U32 tmpOutSize = 0; U32 filterRowPro = 0; U32 item_cp = item_c; - if (rnncellDesc.numProjection > 0) { + if (rnncellDesc.num_projection > 0) { item_cp = forwardRunInfo->best_c[1]; U32 cp_align = (item_cp > 16) ? (item_cp >> 4) : item_cp; U32 tmpOutNum = ALIGN(col, cp_align); tmpOutSize = ALIGN(tmpOutNum * bytesOf(dt), BUFFER_ALIGN_BASE); - filterRowPro = rnncellDesc.numOutput; + filterRowPro = rnncellDesc.num_outputs; } U32 reduceSize = 0; diff --git a/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h b/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h index 05c8ba25..5947e040 100644 --- a/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h +++ b/compute/tensor/src/gpu/mali/fp16/tensor_computing_fp16.h @@ -24,4 +24,97 @@ (gcl_check_meet_device_image3d_limits( \ OCLContext::getInstance().handle.get(), width, height, depth)) +inline std::vector build_conv_forward_algorithm_flag(TensorDesc inputDesc, + std::vector filterDesc, + OperatorType opType, + GCLMemType imt, + GCLMemType omt, + ConvolutionParamSpec convParamSpec) +{ + std::vector flag; + flag.push_back(opType); + flag.push_back(convParamSpec.convolution_type); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (auto &p : filterDesc) { + for (U32 i = 0; i < p.nDims; i++) { + flag.push_back(p.dims[i]); + } + } + flag.push_back(convParamSpec.kernel_t); + flag.push_back(convParamSpec.kernel_h); + flag.push_back(convParamSpec.kernel_w); + flag.push_back(convParamSpec.stride_t); + flag.push_back(convParamSpec.stride_h); + flag.push_back(convParamSpec.stride_w); + flag.push_back(convParamSpec.group); + flag.push_back(convParamSpec.dilatedRate_t); + flag.push_back(convParamSpec.dilatedRate_h); + flag.push_back(convParamSpec.dilatedRate_w); + flag.push_back(imt); + flag.push_back(omt); + return flag; +} + +inline std::vector build_fully_connected_forward_algorithm_flag( + TensorDesc inputDesc, TensorDesc filterDesc, GCLMemType imt, GCLMemType omt) +{ + std::vector flag; + flag.push_back(OT_FC); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (U32 i = 0; i < filterDesc.nDims; i++) { + flag.push_back(filterDesc.dims[i]); + } + 
flag.push_back(imt); + flag.push_back(omt); + return flag; +} + +inline std::vector build_matmul_forward_algorithm_flag(TensorDesc matrixADesc, + bool transposeA, + TensorDesc matrixBDesc, + bool transposeB, + GCLMemType amt, + GCLMemType bmt, + GCLMemType cmt) +{ + std::vector flag; + flag.push_back(OT_MatMul); + flag.push_back(transposeA); + flag.push_back(transposeB); + for (U32 i = 0; i < matrixADesc.nDims; i++) { + flag.push_back(matrixADesc.dims[i]); + } + for (U32 i = 0; i < matrixBDesc.nDims; i++) { + flag.push_back(matrixBDesc.dims[i]); + } + flag.push_back(amt); + flag.push_back(bmt); + flag.push_back(cmt); + return flag; +} + +inline std::vector build_rnn_forward_algorithm_flag( + TensorDesc inputDesc, std::vector filterDesc, RNNParamSpec rnnPara) +{ + std::vector flag; + flag.push_back(OT_RNN); + flag.push_back(rnnPara.steps); + flag.push_back(rnnPara.mode); + flag.push_back(rnnPara.num_outputs); + flag.push_back(rnnPara.num_projection); + flag.push_back(rnnPara.bi_direction); + for (U32 i = 0; i < inputDesc.nDims; i++) { + flag.push_back(inputDesc.dims[i]); + } + for (auto &p : filterDesc) { + for (U32 i = 0; i < p.nDims; i++) { + flag.push_back(p.dims[i]); + } + } + return flag; +} #endif diff --git a/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp index cec878b9..7b57577d 100644 --- a/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/tfslice_mali_fp16.cpp @@ -56,7 +56,6 @@ inline EE tfslice_core_mali_fp16(GCLHandle_t handle, DataFormat imf = input->desc.memFormat; DataFormat omf = output->desc.memFormat; - char kernelName[128]; Kernel kernel; U32 gs[3] = {0, 0, 0}; U32 ls[3] = {0, 0, 0}; @@ -87,7 +86,7 @@ inline EE tfslice_core_mali_fp16(GCLHandle_t handle, gs[0] = ow; gs[1] = oh; gs[2] = oc * on; - sprintf(kernelName, "tfslice_nchw"); + const char *kernelName = "tfslice_nchw"; CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, ow_str, oh_str, i_off, o_off, ic, oc, be[0], be[1], be[2], be[3], stride[0], stride[1], stride[2], stride[3], gs[0], gs[1], inMem, diff --git a/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp b/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp index 7bf03f6a..45ee9cad 100644 --- a/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp +++ b/compute/tensor/src/gpu/mali/fp16/topk_mali_fp16.cpp @@ -43,15 +43,16 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, axis = inputDesc.nDims - 1 - axis; U32 len = inputDesc.dims[axis]; I32 sorted = p.sorted; - I32 top_k = p.topk; + I32 top_k = p.k; I32 largest = p.largest; - char modeName[128]; + std::string modeName; if (largest) { - strcpy(modeName, "max"); + modeName = "max"; } else { - strcpy(modeName, "min"); + modeName = "min"; } if (sorted) { + UNI_ERROR_LOG("GPU have not support topK sorted"); CHECK_STATUS(NOT_SUPPORTED); } Mem outputId = outputIndices->mem; @@ -89,17 +90,16 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, CHECK_STATUS(gcl_create_sub_buffer(size, &sub_off, tmpbuf, &sub_id[3])); Kernel kernel; - char kernelName[1024]; - sprintf(kernelName, "topk_sort_%s", modeName); - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + std::string kernelName = "topk_sort_" + modeName; + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); U32 gs[3] = {0, 0, 0}; U32 ls[3] = {0, 0, 0}; U32 dim = 1; gs[0] = (len + 15) / 16; CHECK_STATUS(gcl_set_kernelArgs(kernel, len, gs[0], input->mem, sub[0], 
sub_id[0])); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif U32 top_k_loop = (top_k + 15) / 16; @@ -108,7 +108,7 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, U32 mem_out_index = 1; U32 out_off = 0; U32 out_val_num = 16; - sprintf(kernelName, "topk_merge_%s", modeName); + kernelName = "topk_merge_" + modeName; Mem merge_in, merge_out, merge_in_id, merge_out_id; gs[0] = (len + 15) / 16; ls[0] = 0; @@ -124,12 +124,12 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, out_off = i * 16; out_val_num = ((i * 16 + 16) <= (U32)top_k) ? 16 : (top_k % 16); } - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, total_group_num, out_val_num, out_off, gs[0], merge_in, merge_in_id, merge_out, merge_out_id)); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif if (gs[0] > 1) { mem_in_index++; @@ -144,7 +144,7 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, } if (i < top_k_loop - 1 || need_out_id) { - sprintf(kernelName, "topk_update_%s", modeName); + kernelName = "topk_update_" + modeName; gs[0] = 16; ls[0] = 16; int out_id_off = out_off; @@ -152,12 +152,12 @@ inline EE topk_core_mali_fp16(GCLHandle_t handle, if (!need_out_id) { outputId = sub_id[0]; } - CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel)); + CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, need_out_id, out_id_off, out_id_num, gs[0], merge_out_id, sub[0], sub_id[0], outputId)); - CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str())); #ifdef _DEBUG - CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName)); + CHECK_STATUS(gcl_run_kernel(handle, kernel, dim, gs, ls, kernelName.c_str())); #endif } } diff --git a/compute/tensor/src/gpu/mali/fully_connected.cpp b/compute/tensor/src/gpu/mali/fully_connected.cpp index ab00b100..3ccfae4e 100644 --- a/compute/tensor/src/gpu/mali/fully_connected.cpp +++ b/compute/tensor/src/gpu/mali/fully_connected.cpp @@ -98,6 +98,13 @@ EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType imt = inputMemDesc.memType; + GCLMemType omt = outputMemDesc.memType; + std::vector flag = build_fully_connected_forward_algorithm_flag( + inputDesc, filterDesc, imt, omt); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } DataType dt = inputDesc.dt; U32 fc = filterDesc.dims[0]; U32 fn = filterDesc.dims[1]; @@ -229,6 +236,7 @@ EE fully_connected_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(tmpBuf); diff --git 
a/compute/tensor/src/gpu/mali/matmul.cpp b/compute/tensor/src/gpu/mali/matmul.cpp index 2e4b6199..fbbc8396 100644 --- a/compute/tensor/src/gpu/mali/matmul.cpp +++ b/compute/tensor/src/gpu/mali/matmul.cpp @@ -202,6 +202,14 @@ EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + GCLMemType amt = gclmemMatrixADesc.memType; + GCLMemType bmt = gclmemMatrixBDesc.memType; + GCLMemType cmt = gclmemMatrixCDesc.memType; + std::vector flag = build_matmul_forward_algorithm_flag( + matrixADesc, transposeA, matrixBDesc, transposeB, amt, bmt, cmt); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector matmulAlgorithms; std::vector vecH; std::vector vecC; @@ -290,6 +298,7 @@ EE matmul_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(matrixA); gcl_destroy_gclmem(matrixB); diff --git a/compute/tensor/src/gpu/mali/pooling.cpp b/compute/tensor/src/gpu/mali/pooling.cpp index 24e16ea6..4cde1ff5 100644 --- a/compute/tensor/src/gpu/mali/pooling.cpp +++ b/compute/tensor/src/gpu/mali/pooling.cpp @@ -27,12 +27,12 @@ EE pooling_padding_input_mali(TensorDesc inputDesc, if (inputMem == nullptr || outputMem == nullptr || outputDesc == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 pl = poolingParamSpec.padding_left; - U32 pr = poolingParamSpec.padding_right; - U32 pt = poolingParamSpec.padding_top; - U32 pb = poolingParamSpec.padding_bottom; - U32 pf = poolingParamSpec.padding_before; - U32 pa = poolingParamSpec.padding_after; + U32 pl = poolingParamSpec.pad_left; + U32 pr = poolingParamSpec.pad_right; + U32 pt = poolingParamSpec.pad_top; + U32 pb = poolingParamSpec.pad_bottom; + U32 pf = poolingParamSpec.pad_before; + U32 pa = poolingParamSpec.pad_after; inputMem->padding(pl, pr, pt, pb, pf, pa); return SUCCESS; } diff --git a/compute/tensor/src/gpu/mali/reduction.cpp b/compute/tensor/src/gpu/mali/reduction.cpp index 8dd5a1e6..99f3f85d 100644 --- a/compute/tensor/src/gpu/mali/reduction.cpp +++ b/compute/tensor/src/gpu/mali/reduction.cpp @@ -37,7 +37,7 @@ inline EE reduction_checkpara_mali(GCLHandle_t handle, if (tensorNumElements(maskDesc) != 0) { CHECK_STATUS(NOT_SUPPORTED); //unsupport currently } - if (p.axes_num > 1) { + if (p.num_axes > 1) { CHECK_STATUS(NOT_SUPPORTED); } int axis = p.axes[0]; @@ -64,7 +64,7 @@ EE reduction_padding_input_mali(TensorDesc inputDesc, int axisTran[6]; TensorDesc tmpDesc = inputDesc; - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { int axis = p.axes[i]; if (axis < 0) { axis = tmpDesc.nDims + axis; diff --git a/compute/tensor/src/gpu/mali/rnn.cpp b/compute/tensor/src/gpu/mali/rnn.cpp index cb9fc93b..7859e490 100644 --- a/compute/tensor/src/gpu/mali/rnn.cpp +++ b/compute/tensor/src/gpu/mali/rnn.cpp @@ -92,6 +92,10 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + std::vector flag = build_rnn_forward_algorithm_flag(inputDesc, filterDescs, rnnPara); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector rnnAlgorithms; std::vector algoNumIndexGemm; std::vector vecHGemm; @@ -105,7 +109,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, std::vector vecHGemvPro; std::vector vecCGemvPro; std::vector vecKGemvPro; - bool useProjection = 
(rnnPara.numProjection > 0) ? true : false; + bool useProjection = (rnnPara.num_projection > 0) ? true : false; U32 filterCol = filterDescs[0].dims[0]; U32 filterRow = filterDescs[0].dims[1]; U32 filterColPro = (useProjection) ? filterDescs[1].dims[0] : filterCol; @@ -267,7 +271,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, outputDescs.push_back(outputDesc); std::vector filters; std::vector biases; - U32 biDirNum = (rnnPara.biDirection) ? 2 : 1; + U32 biDirNum = (rnnPara.bi_direction) ? 2 : 1; for (U32 i = 0; i < biDirNum; i++) { filters.push_back(*filterX); filters.push_back(*filterH); @@ -329,6 +333,7 @@ EE rnn_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(input); gcl_destroy_gclmem(filterX); diff --git a/compute/tensor/src/gpu/mali/rnncell.cpp b/compute/tensor/src/gpu/mali/rnncell.cpp index d2780bae..95c886a2 100644 --- a/compute/tensor/src/gpu/mali/rnncell.cpp +++ b/compute/tensor/src/gpu/mali/rnncell.cpp @@ -32,7 +32,7 @@ inline void rnncell_produce_algos_paras(RNNParamSpec rnnPara, rnncellAlgorithms->push_back(CONVOLUTION_ALGORITHM_GEMM); CHECK_STATUS(get_gemv_cal_scheme(vecH, vecC, vecK)); algoNumIndex->push_back(vecH->size()); - if (rnnPara.numProjection) { + if (rnnPara.num_projection) { CHECK_STATUS(get_gemv_cal_scheme(vecHP, vecCP, vecKP)); algoNumIndexP->push_back(vecHP->size()); } @@ -61,7 +61,7 @@ inline EE rnncell_checkpara_mali(GCLHandle_t handle, if (iB != 1) { CHECK_STATUS(NOT_SUPPORTED); } - U32 hDim = rnnPara.numOutput; + U32 hDim = rnnPara.num_outputs; if (hDesc.dims[0] != hDim && hDesc.dims[1] != hDim) { CHECK_STATUS(NOT_MATCH); } @@ -88,6 +88,11 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, if (algorithm != CONVOLUTION_ALGORITHM_NULL) { return SUCCESS; } + std::vector filterDescVec(1, filterDesc); + std::vector flag = build_rnn_forward_algorithm_flag(xDesc, filterDescVec, rnnPara); + if (gcl_get_runInfo_from_cache(handle, flag, forwardRunInfo)) { + return SUCCESS; + } std::vector rnncellAlgorithms; std::vector algoNumIndex; std::vector vecH; @@ -118,7 +123,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, U32 offset[3] = {0, 0, 0}; U32 maxFilterSize[2] = {0, 0}; TensorDesc ftmDesc[2]; - bool useProject = (rnnPara.numProjection > 0) ? true : false; + bool useProject = (rnnPara.num_projection > 0) ? true : false; U32 filterNum = (useProject) ? 2 : 1; ForwardRunInfoMali runInfo; runInfo.algorithm = rnncellAlgorithms[0]; @@ -155,7 +160,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, if (algosNum == 0) { CHECK_STATUS(NOT_SUPPORTED); } - U32 col = (useProject) ? rnnPara.numProjection : rnnPara.numOutput; + U32 col = (useProject) ? 
rnnPara.num_projection : rnnPara.num_outputs; stride[0] = col * 4; stride[1] = 1; stride[2] = 1; @@ -176,7 +181,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, stride[2] = ftmDesc[1].dims[2]; CHECK_STATUS(gclmem_set_desc_padding( &filter1->desc, stride, offset, dt, DF_NCHW, GCL_MEM_BUF, CL_MEM_READ_WRITE)); - stride[0] = rnnPara.numOutput; + stride[0] = rnnPara.num_outputs; CHECK_STATUS(gclmem_set_desc_padding( &bias1->desc, stride, offset, dt, DF_NHWC, GCL_MEM_BUF, CL_MEM_READ_WRITE)); gcl_create_memory(handle, filter1); @@ -240,6 +245,7 @@ EE rnncell_infer_forward_algorithm_mali(GCLHandle_t handle, CHECK_STATUS(NOT_SUPPORTED); } *forwardRunInfo = bestRunInfo; + gcl_set_runInfo_to_cache(handle, flag, bestRunInfo); CHECK_STATUS(gcl_finish(handle)); gcl_destroy_gclmem(currentX); gcl_destroy_gclmem(state); diff --git a/compute/tensor/src/gpu/mali/roialign.cpp b/compute/tensor/src/gpu/mali/roialign.cpp index 96a5008e..77f914b7 100644 --- a/compute/tensor/src/gpu/mali/roialign.cpp +++ b/compute/tensor/src/gpu/mali/roialign.cpp @@ -43,7 +43,7 @@ inline EE roialign_checkpara_mali(GCLHandle_t handle, outputDesc.dims[3] != inputDescs[1].dims[1]) { CHECK_STATUS(NOT_MATCH) } - if (roiAlignParamSpec.coordinateTransformationMode != ROIALIGN_HALF_PIXEL) { + if (roiAlignParamSpec.trans_mode != COORDINATE_TRANS_HALF_PIXEL) { CHECK_STATUS(NOT_SUPPORTED); } return SUCCESS; diff --git a/compute/tensor/src/gpu/mali/space2depth.cpp b/compute/tensor/src/gpu/mali/space2depth.cpp index d15797b4..71b9481b 100644 --- a/compute/tensor/src/gpu/mali/space2depth.cpp +++ b/compute/tensor/src/gpu/mali/space2depth.cpp @@ -52,7 +52,7 @@ inline EE space2depth_core_mali_fp16(GCLHandle_t handle, inbuf = input->mem; outbuf = output->mem; bool useNchw = (inputDesc.df == DF_NCHWC4) ? 
false : true; - U32 blockSize = space2DepthPara.blockSize; + U32 blockSize = space2DepthPara.block_size; U32 gs[3] = {iw, ih, (ic + 3) / 4}; U32 ls[3] = {0, 0, 0}; @@ -87,7 +87,7 @@ EE space2depth_padding_input_mali(TensorDesc inputDesc, if (inputMem == nullptr || outputMem == nullptr || outputDesc == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 blockSize = space2DepthPara.blockSize; + U32 blockSize = space2DepthPara.block_size; DataType idt; DataFormat idf; U32 iw, ih, ic, in; diff --git a/compute/tensor/src/gpu/mali/transpose.cpp b/compute/tensor/src/gpu/mali/transpose.cpp index 6124bb7b..5306b846 100644 --- a/compute/tensor/src/gpu/mali/transpose.cpp +++ b/compute/tensor/src/gpu/mali/transpose.cpp @@ -27,7 +27,7 @@ EE transpose_padding_input_mali(TensorDesc inputDesc, if (outputDesc == nullptr || inputMem == nullptr || outputMem == nullptr) { CHECK_STATUS(NULL_POINTER); } - U32 *dim = p.trans_dims; + U32 *dim = p.axes; U32 dimTran[6] = {1, 1, 1, 1, 1, 1}; U32 nDims = inputDesc.nDims; for (U32 i = 0; i < nDims; ++i) { @@ -88,8 +88,7 @@ EE transpose_mali(GCLHandle_t handle, CHECK_STATUS(transpose_checkpara_mali(handle, inputDesc, input, outputDesc, output)); switch (inputDesc.dt) { case DT_F16: { - ret = transpose_mali_fp16( - handle, inputDesc, input, outputDesc, output, tmpbuf, p.trans_dims); + ret = transpose_mali_fp16(handle, inputDesc, input, outputDesc, output, tmpbuf, p.axes); break; } default: diff --git a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp index 660d3c13..c71bcccd 100644 --- a/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp +++ b/compute/tensor/src/gpu/mali/uchar/bilateral_slice_apply_mali_uchar.cpp @@ -50,7 +50,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, tensorSelectGet(gridDesc, NULL, NULL, &gn, &gc, &gh, &gw); tensorSelectGet(outputDesc, NULL, NULL, &on, &oc, &oh, &ow); - U32 coe = bilateralSliceApplyParamSpec.coefficient_len; + U32 coe = bilateralSliceApplyParamSpec.coefficient; BilateralSliceApplyMode mode = bilateralSliceApplyParamSpec.mode; U32 dep = gc / coe; U32 gcw = gc * gw; @@ -62,7 +62,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, gridbuf = grid->mem; outbuf = output->mem; gridTran = tmpBuf->mem; - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { guidebuf = guide->mem; } else { guidebuf = inbuf; @@ -85,11 +85,12 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, U32 gs[2] = {ow, oh}; U32 ls[2] = {0, 0}; U32 dim = 2; - char kernelname[128]; - if (mode == BSliceApply_CONV) { - sprintf(kernelname, "bilateral_slice_apply_c12_conv_uchar"); + const char *kernelname; + if (mode == BSLICE_APPLY_CONV) { + kernelname = "bilateral_slice_apply_c12_conv_uchar"; + ; } else { - sprintf(kernelname, "bilateral_slice_apply_c12_uchar"); + kernelname = "bilateral_slice_apply_c12_uchar"; } CHECK_STATUS(gcl_create_kernel(handle, kernelname, &kernel)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw, wh, gc, gw, gh, gcw, dep, coe, gs[0], gs[1], @@ -100,7 +101,7 @@ inline EE bilateral_slice_apply_core_mali_uchar(GCLHandle_t handle, CHECK_STATUS(gcl_run_kernel_profiling(handle, kernel, dim, gs, ls, kernelname)); CHECK_STATUS(gcl_print_memory(handle, input, "bilateral_slice_apply_input")); CHECK_STATUS(gcl_print_memory(handle, output, "bilateral_slice_apply_output")); - if (mode == BSliceApply_NULL) { + if (mode == BSLICE_APPLY_NULL) { CHECK_STATUS(gcl_print_memory(handle, 
guide, "bilateral_slice_apply_guide")); } #endif diff --git a/compute/tensor/src/kl.cpp b/compute/tensor/src/kl.cpp index 07e5912e..4cec2e93 100644 --- a/compute/tensor/src/kl.cpp +++ b/compute/tensor/src/kl.cpp @@ -143,8 +143,6 @@ std::vector compute_scale_with_KL(std::vector &histogram, F32 interval } } } - F32 qSum = sum_func(DT_F32, qExpand.data(), i); - scale_func(DT_F32, qExpand.data(), qExpand.data(), i, 1 / qSum, 0); F32 kld = compute_KLD(i, clipDist.data(), qExpand.data()); if (kld < minKLD) { diff --git a/compute/tensor/src/matmul.cpp b/compute/tensor/src/matmul.cpp index d04dda5b..5d039ebb 100644 --- a/compute/tensor/src/matmul.cpp +++ b/compute/tensor/src/matmul.cpp @@ -13,7 +13,7 @@ #include "tensor_computing.h" #include "blas_enhance.h" -#include + #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif @@ -169,6 +169,38 @@ inline bool useINT8Type(DataType aDt, DataType bDt, DataType cDt, I32 flag) DT_I8 == cDt || flag != 0); } +EE mmm_infer_forward_tmp_bytes(U32 *bytes, + U32 kDimA, + U32 kDimB, + DataFormat dataFormatA, + DataFormat dataFormatB, + TensorDesc matrixADesc, + TensorDesc matrixBDesc, + Arch arch) +{ + EE ret = NOT_SUPPORTED; + if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixDesc, vectorDesc; + if (matrixADesc.dims[1 - kDimA] == 1) { + matrixDesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + } else { + matrixDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + } + ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, arch); + } else { + TensorDesc matrixA2DDesc = + tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2Ddesc = + tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, arch); + } + return ret; +} + EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, bool transposeA, Tensor matrixBTensor, @@ -247,25 +279,12 @@ EE matmul_infer_forward_tmp_bytes(Tensor matrixATensor, kDimB = 1; dataFormatB = DF_NORMAL; } - if (matrixADesc.dims[1 - kDimA] == 1 || matrixBDesc.dims[1 - kDimB] == 1) { - TensorDesc matrixDesc, vectorDesc; - if (matrixADesc.dims[1 - kDimA] == 1) { - matrixDesc = - tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - vectorDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - } else { - matrixDesc = - tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - vectorDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - } - ret = matrix_vector_multiply_tmp_bytes(matrixDesc, vectorDesc, bytes, archInfo->arch); - } else { - TensorDesc matrixA2DDesc = - tensor2df(matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2Ddesc = - tensor2df(matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - ret = matrix_matrix_multiply_tmp_bytes(matrixA2DDesc, matrixB2Ddesc, bytes, archInfo->arch); - } + mmm_infer_forward_tmp_bytes( + bytes, kDimA, kDimB, dataFormatA, dataFormatB, matrixADesc, matrixBDesc, archInfo->arch); +#ifdef _USE_OPENMP + U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); + *bytes *= loopsC; +#endif if (quantA) { *bytes += 
tensorNumBytes(matrixADesc); @@ -396,112 +415,122 @@ EE matmul(Tensor matrixATensor, #endif U32 kDimA, kDimB; + DataFormat dataFormatA, dataFormatB; if (transposeA) { kDimA = 1; + dataFormatA = DF_TRANSPOSE; } else { kDimA = 0; + dataFormatA = DF_NORMAL; } if (transposeB) { kDimB = 0; + dataFormatB = DF_TRANSPOSE; } else { kDimB = 1; + dataFormatB = DF_NORMAL; } + align_input_desc(&matrixADesc, &matrixBDesc); + std::vector p = {(U8 *)matrixA, (U8 *)matrixB, (U8 *)matrixC, (U8 *)tmp}; - U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); - U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); - U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); if (biasTensor.bytes() > 0) { U8 *bias = (U8 *)get_ptr_from_tensor(biasTensor, arch); for (U32 i = 0; i < tensorNumBytes(matrixCDesc) / biasTensor.bytes(); i++) { - memcpy((U8 *)matrixC + i * biasTensor.bytes(), bias, biasTensor.bytes()); + UNI_MEMCPY((U8 *)matrixC + i * biasTensor.bytes(), bias, biasTensor.bytes()); } } else { - memset(matrixC, 0, tensorNumBytes(matrixCDesc)); + UNI_MEMSET(matrixC, 0, tensorNumBytes(matrixCDesc)); } - std::vector ADims, BDims, CDims; - U32 loopsA = tensorNumElements(matrixADesc) / (matrixADesc.dims[1] * matrixADesc.dims[0]); - U32 loopsB = tensorNumElements(matrixBDesc) / (matrixBDesc.dims[1] * matrixBDesc.dims[0]); - U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); - align_input_desc(&matrixADesc, &matrixBDesc); - U32 ia, ib; - for (U32 ic = 0; ic < loopsC; ic++) { - CDims = calculateLocalIndex(ic, matrixCDesc.dims + 2, matrixCDesc.nDims - 2); - if (loopsA == loopsC) { - ia = ic; - } else { - ADims = CDims; - for (U32 i = 2; i < matrixADesc.nDims; i++) { - if (ADims[i - 2] >= matrixADesc.dims[i]) { - ADims[i - 2] = 0; + + U32 mmmBytes = 0; +#if defined(_USE_OPENMP) && defined(_USE_CPU) + mmm_infer_forward_tmp_bytes(&mmmBytes, kDimA, kDimB, dataFormatA, dataFormatB, matrixADesc, + matrixBDesc, archInfo->arch); +#pragma omp parallel num_threads(OMP_NUM_THREADS) +#endif + { + U32 matrixA2DBytes = (matrixADesc.dims[1] * matrixADesc.dims[0]) * bytesOf(matrixADesc.dt); + U32 matrixB2DBytes = (matrixBDesc.dims[1] * matrixBDesc.dims[0]) * bytesOf(matrixBDesc.dt); + U32 matrixC2DBytes = (matrixCDesc.dims[1] * matrixCDesc.dims[0]) * bytesOf(matrixCDesc.dt); + U32 loopsA = tensorNumElements(matrixADesc) / (matrixADesc.dims[1] * matrixADesc.dims[0]); + U32 loopsB = tensorNumElements(matrixBDesc) / (matrixBDesc.dims[1] * matrixBDesc.dims[0]); + U32 loopsC = tensorNumElements(matrixCDesc) / (matrixCDesc.dims[1] * matrixCDesc.dims[0]); +#if defined(_USE_OPENMP) +#pragma omp for +#endif + for (U32 ic = 0; ic < loopsC; ic++) { + U32 ia, ib; + std::vector ADims, BDims, CDims; + U8 *tmpPtr = p[3] + ic * mmmBytes; + CDims = calculateLocalIndex(ic, matrixCDesc.dims + 2, matrixCDesc.nDims - 2); + if (loopsA == loopsC) { + ia = ic; + } else { + ADims = CDims; + for (U32 i = 2; i < matrixADesc.nDims; i++) { + if (ADims[i - 2] >= matrixADesc.dims[i]) { + ADims[i - 2] = 0; + } } + ia = calculateGlobalIndex(ADims.data(), matrixADesc.dims + 2, matrixADesc.nDims - 2); } - ia = calculateGlobalIndex(ADims.data(), matrixADesc.dims + 2, matrixADesc.nDims - 2); - } - if (loopsB == loopsC) { - ib = ic; - } else { - BDims = CDims; - for (U32 i = 2; i < matrixBDesc.nDims; i++) { - if (BDims[i - 2] >= matrixBDesc.dims[i]) { - BDims[i - 2] = 0; + if (loopsB == loopsC) { + ib = ic; + } else { + 
BDims = CDims; + for (U32 i = 2; i < matrixBDesc.nDims; i++) { + if (BDims[i - 2] >= matrixBDesc.dims[i]) { + BDims[i - 2] = 0; + } } + ib = calculateGlobalIndex(BDims.data(), matrixBDesc.dims + 2, matrixBDesc.nDims - 2); } - ib = calculateGlobalIndex(BDims.data(), matrixBDesc.dims + 2, matrixBDesc.nDims - 2); - } - U8 *matrixAPtr = (U8 *)matrixA + ia * matrixA2DBytes; - U8 *matrixBPtr = (U8 *)matrixB + ib * matrixB2DBytes; - U8 *matrixCPtr = (U8 *)matrixC + ic * matrixC2DBytes; - if (matrixADesc.dims[1 - kDimA] == 1) { - TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); - TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, - transposeB ? DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); + U8 *matrixAPtr = p[0] + ia * matrixA2DBytes; + U8 *matrixBPtr = p[1] + ib * matrixB2DBytes; + U8 *matrixCPtr = p[2] + ic * matrixC2DBytes; + if (matrixADesc.dims[1 - kDimA] == 1) { + TensorDesc matrixA1DDesc = tensor1d(matrixADesc.dt, matrixADesc.dims[kDimA]); + TensorDesc matrixB2DDesc = tensor2df(matrixBDesc.dt, + transposeB ? DF_NORMAL : DF_TRANSPOSE, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[0]); - CHECK_STATUS(matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, - matrixAPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); - } else { - if (matrixBDesc.dims[1 - kDimB] == 1) { - TensorDesc matrixA2DDesc; - if (transposeA) { - matrixA2DDesc = tensor2df( - matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); - } else { - matrixA2DDesc = tensor2df( - matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); - } - TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); - TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); - - CHECK_STATUS(matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, - matrixBPtr, tmpBytes, tmp, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); + CHECK_STATUS( + matrix_vector_multiply(matrixB2DDesc, matrixBPtr, matrixA1DDesc, matrixAPtr, + tmpBytes, tmpPtr, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); } else { - DataFormat dataFormatA, dataFormatB; - if (transposeA) { - dataFormatA = DF_TRANSPOSE; - } else { - dataFormatA = DF_NORMAL; - } - if (transposeB) { - dataFormatB = DF_TRANSPOSE; + if (matrixBDesc.dims[1 - kDimB] == 1) { + TensorDesc matrixA2DDesc; + if (transposeA) { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_TRANSPOSE, matrixADesc.dims[1], matrixADesc.dims[0]); + } else { + matrixA2DDesc = tensor2df( + matrixADesc.dt, DF_NORMAL, matrixADesc.dims[1], matrixADesc.dims[0]); + } + TensorDesc matrixB1DDesc = tensor1d(matrixBDesc.dt, matrixBDesc.dims[kDimB]); + TensorDesc matrixC1DDesc = tensor1d(matrixCDesc.dt, matrixCDesc.dims[1]); + + CHECK_STATUS( + matrix_vector_multiply(matrixA2DDesc, matrixAPtr, matrixB1DDesc, matrixBPtr, + tmpBytes, tmpPtr, matrixC1DDesc, matrixCPtr, scalePtr, archInfo->arch)); } else { - dataFormatB = DF_NORMAL; - } - TensorDesc matrixA2DDesc = tensor2df( - matrixADesc.dt, dataFormatA, matrixADesc.dims[1], matrixADesc.dims[0]); - TensorDesc matrixB2DDesc = tensor2df( - matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); - TensorDesc matrixC2DDesc = - tensor2df(matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); + TensorDesc matrixA2DDesc = tensor2df( + matrixADesc.dt, dataFormatA, 
matrixADesc.dims[1], matrixADesc.dims[0]); + TensorDesc matrixB2DDesc = tensor2df( + matrixBDesc.dt, dataFormatB, matrixBDesc.dims[1], matrixBDesc.dims[0]); + TensorDesc matrixC2DDesc = tensor2df( + matrixCDesc.dt, DF_NORMAL, matrixCDesc.dims[1], matrixCDesc.dims[0]); #if defined(_USE_X86) && defined(_USE_INT8) - memset(tmp, 0, matrixCDesc.dims[0] * bytesOf(DT_I32)); + UNI_MEMSET(tmpPtr, 0, matrixCDesc.dims[0] * bytesOf(DT_I32)); #endif - CHECK_STATUS(matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, - matrixBPtr, tmpBytes, tmp, matrixC2DDesc, matrixCPtr, scalePtr, archInfo->arch)); + CHECK_STATUS( + matrix_matrix_multiply(matrixA2DDesc, matrixAPtr, matrixB2DDesc, matrixBPtr, + tmpBytes, tmpPtr, matrixC2DDesc, matrixCPtr, scalePtr, archInfo->arch)); + } } } } - #ifdef _USE_INT8 if (useINT8 && (matrixCTensor.get_desc().dt != matrixCDesc.dt)) { if (DT_I8 == matrixCTensor.get_desc().dt || DT_U8_Q == matrixCTensor.get_desc().dt) { diff --git a/compute/tensor/src/non_max_suppression.cpp b/compute/tensor/src/non_max_suppression.cpp index cf04825f..c77cb482 100644 --- a/compute/tensor/src/non_max_suppression.cpp +++ b/compute/tensor/src/non_max_suppression.cpp @@ -35,15 +35,14 @@ inline EE non_max_suppression_infer_output_size_cpu( CHECK_REQUIREMENT(p.max_output_boxes_per_class != 0); // output size U32 oh, ow; - // oh = the first box for saving the number of available boxes(1) + the maximum number of dectected boxes(max_output_boxes_per_class * num_class) + // oh = the first box for saving the maximum number of dectected boxes(max_output_boxes_per_class * num_class) U32 max_output_boxes_per_class = p.max_output_boxes_per_class; U32 num_class = ic1; U32 num_detected_max = max_output_boxes_per_class * num_class; - oh = num_detected_max + 1; + oh = num_detected_max; // Each width is a 3 dimension vector, which stores [batch_index, class_index, box_index] -> 3 - // The first box is [ number of available boxes, 0, 0 ] ow = 3; - *outputDesc = tensor2d(idt0, oh, ow); + *outputDesc = tensor2d(DT_I32, oh, ow); return SUCCESS; } @@ -77,7 +76,10 @@ EE non_max_suppression(std::vector inputTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output); + U32 length = 0; + ret = non_max_suppression_cpu(inputDesc, input, p, outputDesc, output, &length); + outputDesc.dims[1] = length; + outputTensor.resize(outputDesc); #endif } return ret; diff --git a/compute/tensor/src/non_zero.cpp b/compute/tensor/src/non_zero.cpp new file mode 100644 index 00000000..fe8c443e --- /dev/null +++ b/compute/tensor/src/non_zero.cpp @@ -0,0 +1,34 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE non_zero(Tensor inputTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { + U32 length = 0; + ret = non_zero_cpu(inputDesc, input, outputDesc, output, &length); + outputDesc.dims[0] = length; + outputTensor.resize(outputDesc); + } + return ret; +} diff --git a/compute/tensor/src/normalization.cpp b/compute/tensor/src/normalization.cpp index 4bf343e6..7115d500 100644 --- a/compute/tensor/src/normalization.cpp +++ b/compute/tensor/src/normalization.cpp @@ -29,6 +29,7 @@ #endif EE layer_normalization(Tensor inputTensor, + LayerNormParamSpec p, Tensor alphaTensor, Tensor betaTensor, Tensor tmpTensor, @@ -54,22 +55,28 @@ EE layer_normalization(Tensor inputTensor, EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = layer_normalization_general(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_general(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_X86 } else if (IS_X86(arch)) { - ret = layer_normalization_x86(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_x86(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_NEON } else if (IS_ARM(arch)) { - ret = layer_normalization_arm(inputDesc, input, alpha, beta, outputDesc, output); + ret = layer_normalization_arm(inputDesc, input, p, alpha, beta, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); - ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, (GCLMem_t)tmpbuf, outputDesc, - (GCLMem_t)output); + if (p.axis == -1) { + ret = layer_normalization_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (GCLMem_t)input, (GCLMem_t)alpha, (GCLMem_t)beta, (GCLMem_t)tmpbuf, outputDesc, + (GCLMem_t)output); + } else { + UNI_WARNING_LOG("please close optimizeTransposeLN in " + "model_tools/include/OPOptimizers/LayerNormOptimizer.hpp and " + "reconverter model.\n"); + } #endif } @@ -97,10 +104,7 @@ EE normalization_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, Ar if (outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } - TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); - outputDesc = inputDesc; - outputTensor->resize(outputDesc); + outputTensor->resize(inputTensor->get_desc()); return SUCCESS; } @@ -109,13 +113,15 @@ EE normalization_infer_forward_tmp_bytes(Tensor inputTensor, U32 *bytes, ArchInf if (bytes == nullptr) { CHECK_STATUS(NULL_POINTER); } + EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); - CHECK_STATUS(normalization_infer_forward_tmp_bytes_mali(gclmemInputDesc, bytes)); + ret = normalization_infer_forward_tmp_bytes_mali(gclmemInputDesc, bytes); #endif } else { 
*bytes = 0; + ret = SUCCESS; } - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/onehot.cpp b/compute/tensor/src/onehot.cpp new file mode 100644 index 00000000..2530bc83 --- /dev/null +++ b/compute/tensor/src/onehot.cpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif + +EE onehot_infer_output_size( + Tensor *inputTensor, OneHotParamSpec p, DataType type, Tensor *outputTensor, ArchInfo_t archInfo) +{ + if (inputTensor == nullptr || outputTensor == nullptr) { + CHECK_STATUS(NULL_POINTER); + } + TensorDesc inputDesc = inputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + outputDesc.dt = type; + outputDesc.nDims++; + int axis = (p.axis + outputDesc.nDims) % outputDesc.nDims; + axis = outputDesc.nDims - 1 - axis; + for (U32 i = axis + 1; i < outputDesc.nDims; i++) { + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + outputDesc.dims[axis] = p.depth; + outputTensor->resize(outputDesc); + return SUCCESS; +} + +EE onehot(Tensor inputTensor, OneHotParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = onehot_cpu(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/pooling.cpp b/compute/tensor/src/pooling.cpp index b2949bf9..286b1921 100644 --- a/compute/tensor/src/pooling.cpp +++ b/compute/tensor/src/pooling.cpp @@ -46,41 +46,31 @@ inline EE pooling_infer_output_size_cpu( CHECK_STATUS(NOT_SUPPORTED); return NOT_SUPPORTED; } - RoundMode rm = p.rm; + RoundMode rm = p.round_mode; U32 ot = 0, oh = 0, ow = 0; EE ret = SUCCESS; switch (rm) { - case CEIL: { - ot = (U32)(ceil( - (double(it + p.padding_before + p.padding_after - p.kernel_t) / p.stride_t))) + - 1; - oh = (U32)(ceil( - (double(ih + p.padding_top + p.padding_bottom - p.kernel_h) / p.stride_h))) + - 1; - ow = (U32)(ceil( - (double(iw + p.padding_left + p.padding_right - p.kernel_w) / p.stride_w))) + + case ROUND_CEIL: { + ot = (U32)(ceil((double(it + p.pad_before + p.pad_after - p.kernel_t) / p.stride_t))) + 1; + oh = (U32)(ceil((double(ih + p.pad_top + p.pad_bottom - 
p.kernel_h) / p.stride_h))) + 1; + ow = (U32)(ceil((double(iw + p.pad_left + p.pad_right - p.kernel_w) / p.stride_w))) + 1; break; } - case FLOOR: { - ot = (U32)(floor( - (double(it + p.padding_before + p.padding_after - p.kernel_t) / p.stride_t))) + - 1; - oh = (U32)(floor( - (double(ih + p.padding_top + p.padding_bottom - p.kernel_h) / p.stride_h))) + - 1; - ow = (U32)(floor( - (double(iw + p.padding_left + p.padding_right - p.kernel_w) / p.stride_w))) + + case ROUND_FLOOR: { + ot = (U32)(floor((double(it + p.pad_before + p.pad_after - p.kernel_t) / p.stride_t))) + 1; + oh = (U32)(floor((double(ih + p.pad_top + p.pad_bottom - p.kernel_h) / p.stride_h))) + 1; + ow = (U32)(floor((double(iw + p.pad_left + p.pad_right - p.kernel_w) / p.stride_w))) + 1; break; } - case TF_SAME: { + case ROUND_TF_SAME: { ot = (U32)(ceil((double(it) / p.stride_t))); oh = (U32)(ceil((double(ih) / p.stride_h))); ow = (U32)(ceil((double(iw) / p.stride_w))); break; } - case TF_VALID: { + case ROUND_TF_VALID: { ot = (U32)(ceil((double(it - p.kernel_t + 1) / p.stride_t))); oh = (U32)(ceil((double(ih - p.kernel_h + 1) / p.stride_h))); ow = (U32)(ceil((double(iw - p.kernel_w + 1) / p.stride_w))); @@ -91,16 +81,46 @@ inline EE pooling_infer_output_size_cpu( break; } } + DataFormat odf = idf; + if (idt == DT_U8_Q) { + odf = DF_NCHWC16; + } if (tensorIs3d(inputDesc)) { - *outputDesc = tensor3df(idt, idf, in, ic, oh); + *outputDesc = tensor3df(idt, odf, in, ic, oh); } else if (tensorIs4d(inputDesc)) { - *outputDesc = tensor4df(idt, idf, in, ic, oh, ow); + *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); } else if (tensorIs5d(inputDesc)) { - *outputDesc = tensor5df(idt, idf, in, ic, ot, oh, ow); + *outputDesc = tensor5df(idt, odf, in, ic, ot, oh, ow); } return ret; } +static inline PoolingParamSpec update_param(TensorDesc inDesc, PoolingParamSpec poolingParamSpec) +{ + if (0 == poolingParamSpec.kernel_w) { + if (inDesc.nDims > 3) { + poolingParamSpec.kernel_w = inDesc.dims[0]; + } else { + poolingParamSpec.kernel_w = 1; + } + } + if (0 == poolingParamSpec.kernel_h) { + if (inDesc.nDims > 3) { + poolingParamSpec.kernel_h = inDesc.dims[1]; + } else { + poolingParamSpec.kernel_h = inDesc.dims[0]; + } + } + if (0 == poolingParamSpec.kernel_t) { + if (inDesc.nDims > 4) { + poolingParamSpec.kernel_t = inDesc.dims[2]; + } else { + poolingParamSpec.kernel_t = 1; + } + } + return poolingParamSpec; +} + EE pooling_infer_output_size( Tensor *inputTensor, PoolingParamSpec poolingParamSpec, Tensor *outputTensor, ArchInfo_t archInfo) { @@ -110,15 +130,7 @@ EE pooling_infer_output_size( TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc newInputDesc = transformDescTo4d(inputDesc); TensorDesc outputDesc = outputTensor->get_desc(); - if (0 == poolingParamSpec.kernel_w) { - poolingParamSpec.kernel_w = newInputDesc.dims[0]; - } - if (0 == poolingParamSpec.kernel_h) { - poolingParamSpec.kernel_h = newInputDesc.dims[1]; - } - if (0 == poolingParamSpec.kernel_t) { - poolingParamSpec.kernel_t = newInputDesc.dims[2]; - } + poolingParamSpec = update_param(newInputDesc, poolingParamSpec); CHECK_STATUS(pooling_infer_output_size_cpu(inputDesc, poolingParamSpec, &outputDesc)); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -145,55 +157,74 @@ EE pooling(Tensor inputTensor, void *output = get_ptr_from_tensor(outputTensor, arch); F32 scale[2] = {inputTensor.get_scale(), -1}; void *tmp = get_ptr_from_tensor(tmpTensor, arch); - - if (0 == poolingParamSpec.kernel_w) { - poolingParamSpec.kernel_w = inputDesc.dims[0]; - } - if (0 == 
poolingParamSpec.kernel_h) { - poolingParamSpec.kernel_h = inputDesc.dims[1]; - } - if (0 == poolingParamSpec.kernel_t) { - poolingParamSpec.kernel_t = inputDesc.dims[2]; - } - TensorDesc inDescCPU = inputDesc; - U8 *inputCPU = (U8 *)input; - TensorDesc outDescCPU = outputDesc; - U8 *outputCPU = (U8 *)output; - if (DF_NCHWC16 != inputDesc.df && DF_NCHWC8 != inputDesc.df && IS_CPU(arch)) { - int channelAxis = inputDesc.nDims - 2; - U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; - inDescCPU.dims[channelAxis] = paddedC; - inDescCPU.df = DF_NCHWC8; - outDescCPU.dims[channelAxis] = paddedC; - outDescCPU.df = DF_NCHWC8; - inputCPU = (U8 *)tmp; - outputCPU = inputCPU + tensorNumBytes(inDescCPU); - transformNCHWToNCHWC8(inputDesc, input, inDescCPU, inputCPU); - } + poolingParamSpec = update_param(inputDesc, poolingParamSpec); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { + if (IS_GPU(arch)) { +#ifdef _USE_GPU + ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, + (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, + (GCLMem_t)output); +#endif + } else if (IS_CPU(arch)) { +#ifdef _USE_CPU + U8 *inputCPU = (U8 *)input; + U8 *outputCPU = (U8 *)output; + TensorDesc inDescCPU = inputDesc; + TensorDesc outDescCPU = outputDesc; + DataFormat dstF = outputDesc.df; + int channelAxis = inputDesc.nDims - 2; + + U32 cx = 8; + if (IS_X86(arch)) { + if (dstF == DF_NCHW || dstF == DF_MTK) { + cx = 1; + } + if (inputDesc.dt == DT_U8_Q) { + dstF = DF_NCHWC16; // padding to 16 + cx = 16; + } + } else { + dstF = DF_NCHWC8; + } + + U32 paddedC = (inputDesc.dims[channelAxis] + cx - 1) / cx * cx; + + if (paddedC != inputDesc.dims[channelAxis] || (inputDesc.df != dstF)) { + inDescCPU.dims[channelAxis] = paddedC; + inDescCPU.df = dstF; + inputCPU = (U8 *)tmp; + tmp = (U8 *)tmp + tensorNumBytes(inDescCPU); + transformFormat(inputDesc, input, inDescCPU, inputCPU); + } + + if (paddedC != inputDesc.dims[channelAxis] || (outputDesc.df != dstF)) { + outDescCPU.dims[channelAxis] = paddedC; + outDescCPU.df = dstF; + outputCPU = (U8 *)tmp; + } + + if (IS_GENERAL(arch)) { #ifdef _USE_GENERAL - ret = pooling_general(inDescCPU, inputCPU, poolingParamSpec, outDescCPU, outputCPU); + ret = pooling_general( + inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif #ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); + } else if (IS_X86(arch)) { + ret = pooling_x86(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif #ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); + } else if (IS_ARM(arch)) { + ret = pooling_arm(inDescCPU, inputCPU, poolingParamSpec, scale, outDescCPU, outputCPU); #endif -#ifdef _USE_GPU - } else if (IS_GPU(arch)) { - ret = pooling_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (const GCLMem_t)input, poolingParamSpec, scale, (GCLMem_t)tmp, outputDesc, - (GCLMem_t)output); + } + + if (paddedC != inputDesc.dims[channelAxis] || (outputDesc.df != outDescCPU.df)) { + transformFormat(outDescCPU, outputCPU, outputDesc, output); + } + outputTensor.set_scale(scale[1]); #endif } - if (DF_NCHWC16 != inputDesc.df && DF_NCHWC8 != outputDesc.df && IS_CPU(arch)) { - transformToNCHW(outDescCPU, outputCPU, outputDesc, output); - } - outputTensor.set_scale(scale[1]); + return ret; } @@ -213,13 +244,32 @@ EE pooling_infer_forward_tmp_bytes( } else { *bytes 
= 0; ret = SUCCESS; - if (DF_NCHW == inputDesc.df) { - int channelAxis = inputDesc.nDims - 2; - U32 paddedC = (inputDesc.dims[channelAxis] + 7) / 8 * 8; - TensorDesc outputDesc = outputTensor.get_desc(); + + TensorDesc outputDesc = transformDescTo4d(outputTensor.get_desc()); + DataFormat dstF = outputDesc.df; + int channelAxis = inputDesc.nDims - 2; + U32 cx = 8; + if (IS_X86(archInfo->arch)) { + if (dstF == DF_NCHW || dstF == DF_MTK) { + cx = 1; + } + if (inputDesc.dt == DT_U8_Q) { + dstF = DF_NCHWC16; // padding to 16 + cx = 16; + } + } else { + dstF = DF_NCHWC8; + } + U32 paddedC = (inputDesc.dims[channelAxis] + cx - 1) / cx * cx; + + if (paddedC != inputDesc.dims[channelAxis] || (inputDesc.df != dstF)) { inputDesc.dims[channelAxis] = paddedC; + *bytes += tensorNumBytes(inputDesc); + } + + if (paddedC != outputDesc.dims[channelAxis] || (outputDesc.df != dstF)) { outputDesc.dims[channelAxis] = paddedC; - *bytes = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); + *bytes += tensorNumBytes(outputDesc); } } return ret; diff --git a/compute/tensor/src/power.cpp b/compute/tensor/src/power.cpp index 42658b5e..4e10838f 100644 --- a/compute/tensor/src/power.cpp +++ b/compute/tensor/src/power.cpp @@ -19,28 +19,31 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -inline EE power_infer_output_size_cpu(TensorDesc inputDesc, TensorDesc *outputDesc) +inline EE power_infer_output_size_cpu( + TensorDesc inputDesc, PowerParamSpec p, TensorDesc *outputDesc, Arch arch) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } *outputDesc = inputDesc; - return SUCCESS; + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = power_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, *outputDesc, + outputDesc->dims + outputDesc->nDims, arch); + } +#endif + return ret; } -EE power_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) +EE power_infer_output_size( + Tensor *inputTensor, PowerParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(power_infer_output_size_cpu(inputDesc, &outputDesc)); + EE ret = power_infer_output_size_cpu(inputDesc, p, &outputDesc, archInfo->arch); outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) @@ -50,7 +53,6 @@ EE power(Tensor inputTensor, PowerParamSpec p, Tensor outputTensor, ArchInfo_t a void *input = get_ptr_from_tensor(inputTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU diff --git a/compute/tensor/src/preallocated_memory.cpp b/compute/tensor/src/preallocated_memory.cpp index 35f542cb..575ba51f 100644 --- a/compute/tensor/src/preallocated_memory.cpp +++ b/compute/tensor/src/preallocated_memory.cpp @@ -16,17 +16,32 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -EE preallocated_memory_infer_output_size(Tensor *outputTensor, ArchInfo_t archInfo) +EE preallocated_memory_infer_output_size(std::vector inputTensors, + PreAllocatedMemoryParamSpec p, + Tensor *outputTensor, + ArchInfo_t archInfo) { if (outputTensor == nullptr) { 
CHECK_STATUS(NULL_POINTER); } - TensorDesc outputDesc = outputTensor->get_desc(); - outputTensor->resize(outputDesc); + TensorDesc outDesc = p.desc; + if (inputTensors.size() > 0) { + TensorDesc inDesc = inputTensors[0]->get_desc(); + if (outDesc.nDims == 0) { + outDesc = inDesc; + } else { + for (U32 i = 0; i < UNI_MIN(inDesc.nDims, outDesc.nDims); i++) { + if (outDesc.dims[outDesc.nDims - 1 - i] <= 0) { + outDesc.dims[outDesc.nDims - 1 - i] = inDesc.dims[inDesc.nDims - 1 - i]; + } + } + } + } + outputTensor->resize(outDesc); return SUCCESS; } -EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) +EE preallocated_memory(PreAllocatedMemoryParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc outputDesc = outputTensor.get_desc(); @@ -40,7 +55,7 @@ EE preallocated_memory(Tensor outputTensor, ArchInfo_t archInfo) #endif #ifdef _USE_CPU } else { - memset(output, 0, tensorNumBytes(outputDesc)); + UNI_INIT(tensorNumElements(outputDesc), outputDesc.dt, p.value, output); ret = SUCCESS; #endif } diff --git a/compute/tensor/src/reduction.cpp b/compute/tensor/src/reduction.cpp index 74577aa8..56ded759 100644 --- a/compute/tensor/src/reduction.cpp +++ b/compute/tensor/src/reduction.cpp @@ -35,7 +35,6 @@ EE reduction(Tensor inputTensor, void *tmp = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU @@ -55,31 +54,33 @@ EE reduction_infer_forward_tmp_bytes( Tensor inputTensor, ReductionParamSpec p, Tensor outputTensor, U32 *bytes, ArchInfo_t archInfo) { TensorDesc inputDesc = inputTensor.get_desc(); + EE ret = NOT_SUPPORTED; if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU TensorDesc outputDesc = outputTensor.get_desc(); GCLMemDesc gclmemInputDesc = ocl_get_desc(inputTensor); GCLMemDesc gclmemOutputDesc = ocl_get_desc(outputTensor); - CHECK_STATUS(reduction_infer_forward_tmp_bytes_mali( - inputDesc, p, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes)); - return SUCCESS; + ret = reduction_infer_forward_tmp_bytes_mali( + inputDesc, p, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); #endif - } - int factor = 0; - if (p.axes_num > 1) { - factor = 2; - } - if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { - for (int i = 0; i < p.axes_num; i++) { - // channel dimension - if (p.axes[i] == 1 || p.axes[i] == -3) { - factor = 2; - break; + } else { + int factor = 0; + if (p.num_axes > 1) { + factor = 2; + } + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + for (int i = 0; i < p.num_axes; i++) { + // channel dimension + if (p.axes[i] == 1 || p.axes[i] == -3) { + factor = 2; + break; + } } } + *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; + ret = SUCCESS; } - *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()) * factor; - return SUCCESS; + return ret; } EE reduction_infer_output_size(Tensor *inputTensor, @@ -88,28 +89,26 @@ EE reduction_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc maskDesc = maskTensor.get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - if (IS_GPU(archInfo->arch)) { + Arch arch = archInfo->arch; + EE ret = NOT_SUPPORTED; + if 
(IS_GPU(arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS( - reduction_padding_input_mali(inputDesc, maskDesc, p, &outputDesc, inputMem, outputMem)); + ret = reduction_padding_input_mali(inputDesc, maskDesc, p, &outputDesc, inputMem, outputMem); #endif } else { int start = 0; TensorDesc tmpDesc = inputDesc; U32 cx = (inputDesc.df == DF_NCHWC8) ? 8 : 16; if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { - for (int i = 0; i < p.axes_num; i++) { + for (int i = 0; i < p.num_axes; i++) { // channel dimension if (p.axes[i] == 1 || p.axes[i] == -3) { start = -1; @@ -124,10 +123,10 @@ EE reduction_infer_output_size(Tensor *inputTensor, tmpDesc.nDims += 1; } outputDesc = tmpDesc; - for (int i = start; i < p.axes_num; i++) { + for (int i = start; i < p.num_axes; i++) { int axis; if (i == -1) { - axis = 4; + axis = inputDesc.nDims; } else { axis = p.axes[i]; } @@ -179,7 +178,14 @@ EE reduction_infer_output_size(Tensor *inputTensor, } } } + ret = SUCCESS; + } +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = reduction_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, tensor0d(), nullptr, p, 0, + nullptr, outputDesc, outputDesc.dims + outputDesc.nDims, arch); } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } diff --git a/compute/tensor/src/reshape.cpp b/compute/tensor/src/reshape.cpp index 61b30c55..7a328ceb 100644 --- a/compute/tensor/src/reshape.cpp +++ b/compute/tensor/src/reshape.cpp @@ -56,7 +56,7 @@ EE reshape_infer_forward_tmp_bytes( inputDesc, outputDesc, gclmemInputDesc, gclmemOutputDesc, bytes); #endif } else { - *bytes = UNI_MAX(inputTensor.bytes(), outputTensor.bytes()); + *bytes = 0; ret = SUCCESS; } return ret; diff --git a/compute/tensor/src/rnn.cpp b/compute/tensor/src/rnn.cpp index 7d58dbe6..cf38ccd4 100644 --- a/compute/tensor/src/rnn.cpp +++ b/compute/tensor/src/rnn.cpp @@ -33,13 +33,13 @@ EE rnn_transform_filter(std::vector filterTensors, std::vector filters = get_data_from_tensors(filterTensors, arch); std::vector ftmDescs(ftmTensors.size()); std::vector ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + std::vector scale(ftmTensors.size(), -1); EE ret = NOT_SUPPORTED; - if (IS_CPU(arch)) { #ifdef _USE_CPU ret = rnn_transform_filter_cpu(filterDescs.data(), (const void **)filters.data(), - rnnParamSpec, ftmDescs.data(), ftms.data(), arch); + rnnParamSpec, ftmDescs.data(), ftms.data(), scale.data(), arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -59,6 +59,7 @@ EE rnn_transform_filter(std::vector filterTensors, } for (U32 i = 0; i < ftmTensors.size(); i++) { ftmTensors[i]->resize(ftmDescs[i]); + ftmTensors[i]->set_scale(scale[i]); } return ret; } @@ -103,14 +104,14 @@ EE rnn_infer_output_size(std::vector inputTensors, for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - U32 num = (rnnParamSpec.biDirection) ? 2 : 1; - U32 hDim = num * rnnParamSpec.numOutput; + U32 num = (rnnParamSpec.bi_direction) ? 2 : 1; + U32 hDim = num * rnnParamSpec.num_outputs; std::vector outputDescs; TensorDesc outputDesc = tensor3df(idt, DF_MTK, batch, step, hDim); outputDescs.push_back(outputDesc); - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; if (outputTensors.size() == 2) { if (rnnParamSpec.mode == RNN_LSTM) { outputDesc = tensor2df(idt, DF_NORMAL, batch, column + hDim); @@ -205,9 +206,13 @@ EE rnn(std::vector inputTensors, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU + std::vector scale(filterTensors.size()); + for (U32 i = 0; i < filterTensors.size(); i++) { + scale[i] = filterTensors[i].get_scale(); + } ret = rnn_cpu(inputDescs[0], inputs[0], filterDescs.data(), (const void **)filters.data(), - biasDescs.data(), (const void **)biases.data(), rnnParamSpec, tmpBytes, tmp, - outputDescs[0], outputs[0], arch); + biasDescs.data(), (const void **)biases.data(), scale.data(), rnnParamSpec, tmpBytes, + tmp, outputDescs[0], outputs[0], arch); #endif } else if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -260,7 +265,7 @@ EE rnncell_infer_output_size(std::vector inputTensor, DataFormat idf; U32 batch, xDim; CHECK_STATUS(tensor2dGet(inputDesc, &idt, &idf, &batch, &xDim)); - U32 hDim = rnnParamSpec.numOutput; + U32 hDim = rnnParamSpec.num_outputs; outputDesc = tensor2df(idt, idf, batch, hDim); if (IS_GPU(arch)) { #ifdef _USE_GPU @@ -339,6 +344,7 @@ EE rnncell_transform_filter(std::vector filterTensors, std::vector filters = get_data_from_tensors(filterTensors, arch); std::vector ftmDescs(ftmTensors.size()); std::vector ftms = get_data_from_tensor_ptrs(ftmTensors, arch); + std::vector scale(ftmTensors.size(), -1); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { @@ -347,7 +353,7 @@ EE rnncell_transform_filter(std::vector filterTensors, GCLMem filterTranArray[2]; filterArray[0] = *((GCLMem_t)filters[0]); filterTranArray[0] = *((GCLMem_t)ftms[0]); - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { filterArray[1] = *((GCLMem_t)filters[1]); filterTranArray[1] = *((GCLMem_t)ftms[1]); } @@ -358,6 +364,7 @@ EE rnncell_transform_filter(std::vector filterTensors, } for (U32 i = 0; i < ftmTensors.size(); i++) { ftmTensors[i]->resize(ftmDescs[i]); + ftmTensors[i]->set_scale(scale[i]); } return ret; } @@ -407,9 +414,13 @@ EE rnncell(Tensor xTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU + std::vector scale(filterTensors.size()); + for (U32 i = 0; i < filterTensors.size(); i++) { + scale[i] = filterTensors[i].get_scale(); + } ret = rnncell_cpu(xDesc, currentX, filterDescs.data(), (const void **)filters.data(), - biasDescs.data(), (const void **)biases.data(), state, rnnParamSpec, batchStrideX, - batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); + biasDescs.data(), (const void **)biases.data(), scale.data(), state, rnnParamSpec, + batchStrideX, batchStrideH, tmpBytes, tmp, hDesc, currentH, archInfo->arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { @@ -417,7 +428,7 @@ EE rnncell(Tensor xTensor, GCLMem biasArray[2]; filterArray[0] = *((GCLMem_t)filters[0]); biasArray[0] = *((GCLMem_t)biases[0]); - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { filterArray[1] = *((GCLMem_t)filters[1]); //biasArray[1] = *((GCLMem_t)biases[1]);currently only init one bias } diff --git a/compute/tensor/src/roialign.cpp b/compute/tensor/src/roialign.cpp index af9711fe..dafc68c3 100644 --- a/compute/tensor/src/roialign.cpp +++ b/compute/tensor/src/roialign.cpp @@ -85,6 +85,8 @@ EE roialign_infer_forward_tmp_bytes( CHECK_STATUS( roialign_infer_forward_tmp_bytes_mali(inputDesc, gclmemInputDesc, outputDesc, bytes)); #endif + } else { + *bytes = 0; } return SUCCESS; } @@ -98,7 +100,6 @@ EE roialign(std::vector 
inputTensor, auto arch = archInfo->arch; std::vector inputDesc = get_desc_from_tensors(inputTensor); std::vector input = get_data_from_tensors(inputTensor, arch); - void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -108,6 +109,7 @@ EE roialign(std::vector inputTensor, #endif } else if (IS_GPU(arch)) { #ifdef _USE_GPU + void *tmpbuf = get_ptr_from_tensor(tmpTensor, arch); ret = roialign_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, input, p, (GCLMem_t)tmpbuf, outputDesc, (GCLMem_t)output); #endif diff --git a/compute/tensor/src/scale.cpp b/compute/tensor/src/scale.cpp index 94a02828..1c02eba5 100644 --- a/compute/tensor/src/scale.cpp +++ b/compute/tensor/src/scale.cpp @@ -12,14 +12,8 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #include "tensor_computing.h" -#ifdef _USE_GENERAL -#include "cpu/general/tensor_computing_general.h" -#endif -#ifdef _USE_X86 -#include "cpu/x86/tensor_computing_x86.h" -#endif -#ifdef _USE_NEON -#include "cpu/arm/tensor_computing_arm.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" #endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" @@ -83,17 +77,9 @@ EE scale(Tensor inputTensor, void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { -#ifdef _USE_GENERAL - ret = scale_general(inputDesc, input, alpha, beta, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = scale_x86(inputDesc, input, alpha, beta, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = scale_arm(inputDesc, input, alpha, beta, p, outputDesc, output); + if (IS_CPU(arch)) { +#ifdef _USE_CPU + ret = scale_cpu(inputDesc, input, alpha, beta, p, outputDesc, output, arch); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { diff --git a/compute/tensor/src/slice.cpp b/compute/tensor/src/slice.cpp index a7764c7f..2b218dcc 100644 --- a/compute/tensor/src/slice.cpp +++ b/compute/tensor/src/slice.cpp @@ -20,12 +20,9 @@ #endif inline EE slice_infer_output_size_cpu( - TensorDesc inputDesc, SliceParamSpec p, std::vector *outputDesc) + TensorDesc inputDesc, SliceParamSpec p, std::vector& outputDesc) { - if (nullptr == outputDesc) { - CHECK_STATUS(NULL_POINTER); - } - U32 num = (*outputDesc).size(); + U32 num = outputDesc.size(); int axis = (p.axis + inputDesc.nDims) % inputDesc.nDims; I32 *slice_points = p.slice_points; @@ -37,12 +34,13 @@ inline EE slice_infer_output_size_cpu( } } I32 target_axis = inputDesc.nDims - 1 - axis; + I32 cDim = (I32)inputDesc.nDims - 2; if (splitEqual) { CHECK_REQUIREMENT(0 == inputDesc.dims[target_axis] % num); inputDesc.dims[target_axis] /= num; } for (U32 i = 0; i < num; i++) { - (*outputDesc)[i] = inputDesc; + outputDesc[i] = inputDesc; if (splitEqual) { continue; } @@ -55,7 +53,7 @@ inline EE slice_infer_output_size_cpu( if (i < num - 1) { next_point = slice_points[i]; } - if (i == 0 && num == 1 && p.slice_size == 1) { // Could happen in onnx + if (i == 0 && num == 1 && p.num_slice == 1) { // Could happen in onnx next_point = slice_points[0]; } if (prev_point < 0) { @@ -70,20 +68,45 @@ inline EE slice_infer_output_size_cpu( next_point = 0; } } - (*outputDesc)[i].dims[target_axis] = next_point - prev_point; + outputDesc[i].dims[target_axis] = next_point - prev_point; + } + + for (U32 i = 0; i < num; i++) { + if ((cDim >= 0) && (outputDesc[i].dims[cDim] 
% 8 != 0)) { + if (outputDesc[i].nDims >= 4) { + outputDesc[i].df = DF_NCHW; + } else if (outputDesc[i].nDims == 3) { + outputDesc[i].df = DF_MTK; + } else if (outputDesc[i].nDims == 2) { + outputDesc[i].df = DF_NORMAL; + } else { + return NOT_SUPPORTED; + } + } } - return SUCCESS; + + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + std::vector output(num); + for (U32 i = 0; i < num; i++) { + output[i] = outputDesc[i].dims + outputDesc[i].nDims; + } + ret = slice_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, outputDesc, output); + } +#endif + return ret; } EE slice_infer_output_size( Tensor *inputTensor, SliceParamSpec p, std::vector outputTensor, ArchInfo_t archInfo) { if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); std::vector outputDesc = get_desc_from_tensor_ptrs(outputTensor); - CHECK_STATUS(slice_infer_output_size_cpu(inputDesc, p, &outputDesc)); + EE ret = slice_infer_output_size_cpu(inputDesc, p, outputDesc); if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); @@ -91,13 +114,13 @@ EE slice_infer_output_size( for (U32 i = 0; i < outputTensor.size(); i++) { outputMems.push_back((OclMemory *)outputTensor[i]->get_memory()); } - CHECK_STATUS(slice_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMems)); + ret = slice_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMems); #endif } for (U32 i = 0; i < outputTensor.size(); i++) { outputTensor[i]->resize(outputDesc[i]); } - return SUCCESS; + return ret; } EE slice_infer_forward_tmp_bytes(Tensor inputTensor, @@ -139,7 +162,7 @@ EE slice(Tensor inputTensor, EE ret = NOT_SUPPORTED; if (IS_CPU(arch)) { #ifdef _USE_CPU - ret = slice_cpu(inputDesc, input, p, outputDesc, &output); + ret = slice_cpu(inputDesc, input, p, outputDesc, output); #endif #ifdef _USE_GPU } else if (IS_GPU(arch)) { diff --git a/compute/tensor/src/softmax.cpp b/compute/tensor/src/softmax.cpp index cc173c1b..ac2b02b8 100644 --- a/compute/tensor/src/softmax.cpp +++ b/compute/tensor/src/softmax.cpp @@ -25,37 +25,6 @@ #include "gpu/mali/tensor_computing_mali.h" #endif -EE softmax( - Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) -{ - auto arch = archInfo->arch; - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); - EE ret = NOT_SUPPORTED; - if (IS_GENERAL(arch)) { -#ifdef _USE_GENERAL - ret = softmax_general(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_X86 - } else if (IS_X86(arch)) { - ret = softmax_x86(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_NEON - } else if (IS_ARM(arch)) { - ret = softmax_arm(inputDesc, input, p, outputDesc, output); -#endif -#ifdef _USE_GPU - } else if (IS_GPU(arch)) { - void *tmp = get_ptr_from_tensor(tmpTensor, arch); - ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, - p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); -#endif - } - return ret; -} - EE softmax_infer_output_size( Tensor *inputTensor, SoftmaxParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { @@ -102,3 +71,59 @@ EE softmax_infer_forward_tmp_bytes( } return ret; } + +EE softmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = 
archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = softmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = softmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = softmax_arm(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_GPU + } else if (IS_GPU(arch)) { + void *tmp = get_ptr_from_tensor(tmpTensor, arch); + ret = softmax_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, (GCLMem_t)input, + p, (GCLMem_t)tmp, outputDesc, (GCLMem_t)output); +#endif + } + return ret; +} + +EE logsoftmax( + Tensor inputTensor, SoftmaxParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) +{ + auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); + EE ret = NOT_SUPPORTED; + if (IS_GENERAL(arch)) { +#ifdef _USE_GENERAL + ret = logsoftmax_general(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_X86 + } else if (IS_X86(arch)) { + ret = logsoftmax_x86(inputDesc, input, p, outputDesc, output); +#endif +#ifdef _USE_NEON + } else if (IS_ARM(arch)) { + ret = logsoftmax_arm(inputDesc, input, p, outputDesc, output); +#endif + } + return ret; +} diff --git a/compute/tensor/src/space2depth.cpp b/compute/tensor/src/space2depth.cpp index 7c6b9cb5..6c8d6efc 100644 --- a/compute/tensor/src/space2depth.cpp +++ b/compute/tensor/src/space2depth.cpp @@ -12,14 +12,15 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
#include "tensor_computing.h" +#ifdef _USE_CPU +#include "cpu/tensor_computing_cpu.h" +#endif #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -EE space2depth_infer_output_size(Tensor *inputTensor, - Space2DepthParamSpec space2DepthPara, - Tensor *outputTensor, - ArchInfo_t archInfo) +EE space2depth_infer_output_size( + Tensor *inputTensor, Space2DepthParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { if (inputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); @@ -29,35 +30,42 @@ EE space2depth_infer_output_size(Tensor *inputTensor, } auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor->get_desc(); - TensorDesc outputDesc = outputTensor->get_desc(); + TensorDesc outputDesc = inputDesc; + EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU OclMemory *inputMem = (OclMemory *)inputTensor->get_memory(); OclMemory *outputMem = (OclMemory *)outputTensor->get_memory(); - CHECK_STATUS(space2depth_padding_input_mali( - inputDesc, space2DepthPara, &outputDesc, inputMem, outputMem)); + ret = space2depth_padding_input_mali(inputDesc, p, &outputDesc, inputMem, outputMem); #endif + } else { + for (int i = 0; i < (int)outputDesc.nDims - 2; i++) { + outputDesc.dims[i] /= p.block_size; + outputDesc.dims[outputDesc.nDims - 2] *= p.block_size; + } + outputDesc.df = getTensorDefaultDataFormat(outputDesc.nDims); + ret = SUCCESS; } outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } -EE space2depth(Tensor inputTensor, - Space2DepthParamSpec space2DepthPara, - Tensor outputTensor, - ArchInfo_t archInfo) +EE space2depth(Tensor inputTensor, Space2DepthParamSpec p, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; + TensorDesc inputDesc = inputTensor.get_desc(); + void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); + void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; if (IS_GPU(arch)) { #ifdef _USE_GPU - TensorDesc inputDesc = inputTensor.get_desc(); - void *input = get_ptr_from_tensor(inputTensor, arch); - TensorDesc outputDesc = outputTensor.get_desc(); - void *output = get_ptr_from_tensor(outputTensor, arch); - ret = space2depth_mali(((MaliPara_t)(archInfo->archPara))->handle, inputDesc, - (GCLMem_t)input, space2DepthPara, outputDesc, (GCLMem_t)output); + (GCLMem_t)input, p, outputDesc, (GCLMem_t)output); +#endif + } else { +#ifdef _USE_CPU + ret = space2depth_cpu(inputDesc, input, p, outputDesc, output); #endif } return ret; diff --git a/compute/tensor/src/squeeze.cpp b/compute/tensor/src/squeeze.cpp index d9d2c264..d08b1431 100644 --- a/compute/tensor/src/squeeze.cpp +++ b/compute/tensor/src/squeeze.cpp @@ -15,13 +15,13 @@ #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -#include EE squeeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -34,8 +34,13 @@ EE squeeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t #endif #ifdef _USE_CPU } else { - if (output != input) { - memcpy(output, input, tensorNumBytes(inputDesc)); + if ((inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) && + inputDesc.df != outputDesc.df) { + TensorDesc nchwDesc = inputDesc; + nchwDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, 
nchwDesc, output); + } else { + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } ret = SUCCESS; #endif @@ -47,25 +52,41 @@ EE squeeze_infer_output_size_cpu( TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) { *outputDesc = inputDesc; + if ((int)inputDesc.nDims == axesNum) { + outputDesc->nDims = 1; + outputDesc->df = DF_SCALAR; + return SUCCESS; + } for (int i = 0; i < axesNum; i++) { int axis = axes[i]; if (axis < 0) { axis += inputDesc.nDims; } - outputDesc->dims[inputDesc.nDims - 1 - axis] = 0; + if (outputDesc->dims[inputDesc.nDims - 1 - axis] != 1) { + UNI_ERROR_LOG( + "try to squeeze non-one dimension in (%s).\n", tensorDesc2Str(inputDesc).c_str()); + } + outputDesc->dims[inputDesc.nDims - 1 - axis] = INT_MAX; } U32 index = 0; for (U32 i = 0; i < inputDesc.nDims; i++) { - if (outputDesc->dims[i] != 0) { + if (outputDesc->dims[i] != INT_MAX) { outputDesc->dims[index++] = outputDesc->dims[i]; } } CHECK_REQUIREMENT(index + axesNum == inputDesc.nDims); outputDesc->nDims = index; - if (inputDesc.df != DF_NCHWC8) { - outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); - } else { - outputDesc->df = DF_NCHWC8; + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + bool changeChannelAxis = false; + for (int i = 0; i < axesNum; i++) { + if (axes[i] < 1) { + changeChannelAxis = true; + } + } + if (!changeChannelAxis) { + outputDesc->df = inputDesc.df; + } } return SUCCESS; } @@ -81,7 +102,7 @@ EE squeeze_infer_output_size( } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - CHECK_STATUS(squeeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc)); + CHECK_STATUS(squeeze_infer_output_size_cpu(inputDesc, p.axes, p.num_axes, &outputDesc)); outputTensor->resize(outputDesc); return SUCCESS; } diff --git a/compute/tensor/src/tfslice.cpp b/compute/tensor/src/tfslice.cpp index b6c0a824..f477a3c5 100644 --- a/compute/tensor/src/tfslice.cpp +++ b/compute/tensor/src/tfslice.cpp @@ -22,10 +22,7 @@ EE tfslice_infer_output_size( Tensor *inputTensor, TfSliceParamSpec p, Tensor *outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { + if (inputTensor == nullptr || outputTensor == nullptr) { CHECK_STATUS(NULL_POINTER); } TensorDesc inputDesc = inputTensor->get_desc(); @@ -38,8 +35,15 @@ EE tfslice_infer_output_size( } #endif } + EE ret = SUCCESS; +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + ret = tfslice_cpu(inputDesc, inputDesc.dims + inputDesc.nDims, p, outputDesc, + outputDesc.dims + outputDesc.nDims); + } +#endif outputTensor->resize(outputDesc); - return SUCCESS; + return ret; } EE tfslice_infer_forward_tmp_bytes( diff --git a/compute/tensor/src/tile.cpp b/compute/tensor/src/tile.cpp index cce7feff..94ce1b87 100644 --- a/compute/tensor/src/tile.cpp +++ b/compute/tensor/src/tile.cpp @@ -23,15 +23,15 @@ EE tile_infer_output_size( auto inDim = inputTensor->get_desc(); auto outDim = inDim; - if ((int)inDim.nDims == tileParamSpec.dimsSize) { - for (int i = 0; i < tileParamSpec.dimsSize; i++) { - outDim.dims[tileParamSpec.dimsSize - 1 - i] = - inDim.dims[tileParamSpec.dimsSize - 1 - i] * tileParamSpec.repeatsInfo[i]; + if ((int)inDim.nDims == tileParamSpec.num_repeats) { + for (int i = 0; i < tileParamSpec.num_repeats; i++) { + outDim.dims[tileParamSpec.num_repeats - 1 - i] = + inDim.dims[tileParamSpec.num_repeats - 1 - i] * 
tileParamSpec.repeats[i]; } } else { int axis = (tileParamSpec.axis >= 0) ? tileParamSpec.axis : tileParamSpec.axis + inDim.nDims; axis = inDim.nDims - 1 - axis; - outDim.dims[axis] = outDim.dims[axis] * tileParamSpec.repeatsInfo[0]; + outDim.dims[axis] = outDim.dims[axis] * tileParamSpec.repeats[0]; } if (IS_GPU(archInfo->arch)) { #ifdef _USE_GPU @@ -89,25 +89,25 @@ EE tile(Tensor inputTensor, outputDesc.dims[0] *= 8; } - if (tileParamSpec.dimsSize != (int)inputDesc.nDims) { - CHECK_REQUIREMENT(tileParamSpec.dimsSize == 1); + if (tileParamSpec.num_repeats != (int)inputDesc.nDims) { + CHECK_REQUIREMENT(tileParamSpec.num_repeats == 1); int axis = (tileParamSpec.axis >= 0) ? tileParamSpec.axis : tileParamSpec.axis + inputDesc.nDims; - U32 tiles = tileParamSpec.repeatsInfo[0]; + U32 tiles = tileParamSpec.repeats[0]; for (int i = 0; i < (int)inputDesc.nDims; ++i) { - tileParamSpec.repeatsInfo[i] = 1; + tileParamSpec.repeats[i] = 1; if (axis == i) { - tileParamSpec.repeatsInfo[i] = tiles; + tileParamSpec.repeats[i] = tiles; } } } U32 repeat_num = 0; for (U32 i = 0; i < inputDesc.nDims; ++i) { - repeat_num += (tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - i] > 1); + repeat_num += (tileParamSpec.repeats[inputDesc.nDims - 1 - i] > 1); } if (repeat_num == 0) { - memcpy(output, input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); return SUCCESS; } @@ -122,14 +122,14 @@ EE tile(Tensor inputTensor, bool first_copy = true; for (U32 j = 0; j < inputDesc.nDims; ++j) { - if (tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - j] > 1) { - U32 tiles = tileParamSpec.repeatsInfo[inputDesc.nDims - 1 - j]; + if (tileParamSpec.repeats[inputDesc.nDims - 1 - j] > 1) { + U32 tiles = tileParamSpec.repeats[inputDesc.nDims - 1 - j]; int loopOuter = itile_size[inputDesc.nDims - 1] / itile_size[j]; if (first_copy) { first_copy = false; for (int i = 0; i < loopOuter; ++i) { for (U32 ii = 0; ii < tiles; ++ii) { - memcpy(output_ptr + i * tiles * itile_size[j] + ii * itile_size[j], + UNI_MEMCPY(output_ptr + i * tiles * itile_size[j] + ii * itile_size[j], input_ptr + i * itile_size[j], itile_size[j]); } } @@ -138,7 +138,7 @@ EE tile(Tensor inputTensor, for (U32 ii = 0; ii < tiles; ++ii) { if (i != 0 || ii != 0) { U32 copy_size = otile_size[j - 1] * inputDesc.dims[i]; - memcpy(output_ptr + i * tiles * copy_size + ii * copy_size, + UNI_MEMCPY(output_ptr + i * tiles * copy_size + ii * copy_size, output_ptr + i * copy_size, copy_size); } } diff --git a/compute/tensor/src/topk.cpp b/compute/tensor/src/topk.cpp index 1a9dc4b2..71a15d69 100644 --- a/compute/tensor/src/topk.cpp +++ b/compute/tensor/src/topk.cpp @@ -86,8 +86,10 @@ EE topk_infer_output_size(Tensor *inputTensor, outputDesc = inputDesc; outputIndicesDesc = inputDesc; int axis = inputDesc.nDims - 1 - (p.axis + inputDesc.nDims) % inputDesc.nDims; - outputDesc.dims[axis] = p.topk; - outputIndicesDesc.dims[axis] = p.topk; + if (p.k > 0) { + outputDesc.dims[axis] = p.k; + outputIndicesDesc.dims[axis] = p.k; + } outputIndicesDesc.dt = DT_I32; outputTensor->resize(outputDesc); outputIndicesTensor->resize(outputIndicesDesc); diff --git a/compute/tensor/src/transpose.cpp b/compute/tensor/src/transpose.cpp index 1f9444cd..0b848e58 100644 --- a/compute/tensor/src/transpose.cpp +++ b/compute/tensor/src/transpose.cpp @@ -34,13 +34,13 @@ EE transpose(Tensor inputTensor, void *input = get_ptr_from_tensor(inputTensor, arch); TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); - std::vector 
tmpDims(p.trans_dims, p.trans_dims + p.trans_size); + std::vector tmpDims(p.axes, p.axes + p.num_axes); if (IS_CPU(arch)) { // Keep transDims unchanged so that input resize does not lead to error - if (inputDesc.nDims == 4 && p.trans_size == 3 && inputDesc.dims[0] == 1) { - inputDesc = tensor3df(inputDesc.dt, inputDesc.df, inputDesc.dims[3], inputDesc.dims[2], - inputDesc.dims[1]); - } + //if (inputDesc.nDims == 4 && p.num_axes == 3 && inputDesc.dims[0] == 1) { + // inputDesc = tensor3df(inputDesc.dt, inputDesc.df, inputDesc.dims[3], inputDesc.dims[2], + // inputDesc.dims[1]); + //} if (DF_NCHWC8 == inputDesc.df || DF_NCHWC16 == inputDesc.df) { U32 cx = 8; @@ -48,7 +48,7 @@ EE transpose(Tensor inputTensor, cx = 16; CHECK_REQUIREMENT(inputDesc.dims[inputDesc.nDims - 2] % 16 == 0); } - if (inputDesc.nDims == p.trans_size) { + if (inputDesc.nDims == p.num_axes) { auto ptr = std::find(tmpDims.begin(), tmpDims.end(), 1); tmpDims.insert(ptr + 1, inputDesc.nDims); } @@ -75,6 +75,22 @@ EE transpose(Tensor inputTensor, } outputDesc = desc; } + if (outputDesc.df == DF_NCHWC8) { + int icaxis = inputDesc.nDims - 1 - p.axes[1]; + for (int i = inputDesc.nDims; i > icaxis; i--) { + inputDesc.dims[i] = inputDesc.dims[i - 1]; + } + inputDesc.nDims++; + inputDesc.dims[icaxis] = 8; + inputDesc.dims[icaxis + 1] /= 8; + for (int i = outputDesc.nDims; i > 0; i--) { + outputDesc.dims[i] = outputDesc.dims[i - 1]; + } + outputDesc.nDims++; + outputDesc.dims[0] = 8; + outputDesc.dims[outputDesc.nDims - 2] /= 8; + tmpDims.push_back(tmpDims.size()); + } } EE ret = NOT_SUPPORTED; if (IS_GENERAL(arch)) { @@ -102,32 +118,31 @@ inline EE transpose_infer_output_size_cpu( CHECK_STATUS(NULL_POINTER); } - U32 *dim = p.trans_dims; + U32 *dim = p.axes; *outputDesc = inputDesc; - U32 inputDim = inputDesc.nDims; - if (4 == inputDim) { - (*outputDesc).df = DF_NCHW; - } - U32 outputDim = (*outputDesc).nDims; + U32 num = inputDesc.nDims; U32 index = 0; - for (U32 i = 0; i < p.trans_size; i++) { + for (U32 i = 0; i < p.num_axes; i++) { // use 5-dim array to transpose a NCHWC8 tensor. skip c8 axis - if (dim[i] >= inputDim) { + if (dim[i] >= num) { continue; } // NOTE: TensorDesc.dims array is in [W H C N] order. 
// so if you want to transpose [N C H W] format data, we use (dims - 1 - *) // [5 6 7 8] + [0 3 2 1] = [5 8 7 6] // [8 7 6 5] + [0 3 2 1] = [6 7 8 5] - (*outputDesc).dims[outputDim - 1 - index] = inputDesc.dims[inputDim - 1 - dim[i]]; + outputDesc->dims[num - 1 - index] = inputDesc.dims[num - 1 - dim[i]]; index++; } - if (outputDesc->nDims >= 4 || inputDesc.df == DF_NCHWC8) { + if (inputDesc.df == DF_NCHWC8) { outputDesc->df = DF_NCHW; } - if ((*outputDesc).nDims == 4 && p.trans_size == 3 && (*outputDesc).dims[0] == 1) { - (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, (*outputDesc).dims[3], - (*outputDesc).dims[2], (*outputDesc).dims[1]); + //if (outputDesc->nDims == 4 && p.num_axes == 3 && outputDesc->dims[0] == 1) { + // (*outputDesc) = tensor3df(inputDesc.dt, DF_NCHW, outputDesc->dims[3], + // outputDesc->dims[2], outputDesc->dims[1]); + //} + if (p.df == DF_NCHWC8 && outputDesc->dims[num - 2] % 8 == 0) { + outputDesc->df = DF_NCHWC8; } return SUCCESS; } diff --git a/compute/tensor/src/unsqueeze.cpp b/compute/tensor/src/unsqueeze.cpp index 289d3086..27ac3d1a 100644 --- a/compute/tensor/src/unsqueeze.cpp +++ b/compute/tensor/src/unsqueeze.cpp @@ -15,13 +15,13 @@ #ifdef _USE_GPU #include "gpu/mali/tensor_computing_mali.h" #endif -#include EE unsqueeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; TensorDesc inputDesc = inputTensor.get_desc(); void *input = get_ptr_from_tensor(inputTensor, arch); + TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); EE ret = NOT_SUPPORTED; @@ -34,8 +34,13 @@ EE unsqueeze(Tensor inputTensor, Tensor tmpTensor, Tensor outputTensor, ArchInfo #endif #ifdef _USE_CPU } else { - if (output != input) { - memcpy(output, input, tensorNumBytes(inputDesc)); + if ((inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) && + inputDesc.df != outputDesc.df) { + TensorDesc nchwDesc = inputDesc; + nchwDesc.df = DF_NCHW; + transformToNCHW(inputDesc, input, nchwDesc, output); + } else { + UNI_MEMCPY(output, input, tensorNumBytes(inputDesc)); } ret = SUCCESS; #endif @@ -47,11 +52,22 @@ EE unsqueeze_infer_output_size_cpu( TensorDesc inputDesc, int *axes, int axesNum, TensorDesc *outputDesc) { outputDesc->dt = inputDesc.dt; - outputDesc->nDims = inputDesc.nDims + axesNum; - if (inputDesc.df != DF_NCHWC8) { - outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_SCALAR) { + outputDesc->nDims = axesNum; } else { - outputDesc->df = DF_NCHWC8; + outputDesc->nDims = inputDesc.nDims + axesNum; + } + outputDesc->df = getTensorDefaultDataFormat(outputDesc->nDims); + if (inputDesc.df == DF_NCHWC8 || inputDesc.df == DF_NCHWC16) { + bool changeChannelAxis = false; + for (int i = 0; i < axesNum; i++) { + if (axes[i] <= 1) { + changeChannelAxis = true; + } + } + if (!changeChannelAxis) { + outputDesc->df = inputDesc.df; + } } for (U32 i = 0; i < outputDesc->nDims; i++) { outputDesc->dims[i] = 0; @@ -69,22 +85,28 @@ EE unsqueeze_infer_output_size_cpu( outputDesc->dims[i] = inputDesc.dims[index++]; } } - CHECK_REQUIREMENT(index == inputDesc.nDims); + if (inputDesc.df != DF_SCALAR) { + CHECK_REQUIREMENT(index == inputDesc.nDims); + } +#ifdef _USE_CPU + if (tensorIsShape(inputDesc)) { + for (U32 i = 0; outputDesc->nDims + i < DIM_LEN; i++) { + outputDesc->dims[outputDesc->nDims + i] = inputDesc.dims[inputDesc.nDims + i]; + } + } +#endif return SUCCESS; } EE unsqueeze_infer_output_size( Tensor *inputTensor, UnsqueezeParamSpec p, Tensor 
*outputTensor, ArchInfo_t archInfo) { - if (inputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); - } - if (outputTensor == nullptr) { - CHECK_STATUS(NULL_POINTER); + if (inputTensor == nullptr || outputTensor == nullptr) { + return NULL_POINTER; } TensorDesc inputDesc = inputTensor->get_desc(); TensorDesc outputDesc = outputTensor->get_desc(); - EE ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.axes_num, &outputDesc); + EE ret = unsqueeze_infer_output_size_cpu(inputDesc, p.axes, p.num_axes, &outputDesc); outputTensor->resize(outputDesc); return ret; } diff --git a/compute/tensor/src/where.cpp b/compute/tensor/src/where.cpp index 7eec13bc..e0a031f7 100644 --- a/compute/tensor/src/where.cpp +++ b/compute/tensor/src/where.cpp @@ -10,129 +10,153 @@ // WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#include "tensor_computing.h" -EE where_infer_output_size(Tensor *inputTensor, Tensor *outputTensor, ArchInfo_t archInfo) -{ - auto inDesc = inputTensor->get_desc(); - auto outDesc = inDesc; - outputTensor->resize(outDesc); - return SUCCESS; -} +#include "tensor_computing.h" -bool tensorDescEqual(TensorDesc a, TensorDesc b) +EE where_infer_output_size( + Tensor *xTensor, Tensor *yTensor, Tensor *outputTensor, ArchInfo_t archInfo) { - if (a.nDims != b.nDims) { - return false; - } else { - for (int i = 0; i < (int)(a.nDims); i++) { - if (a.dims[i] != b.dims[i]) { - return false; - } + TensorDesc xDesc = xTensor->get_desc(); + TensorDesc yDesc = yTensor->get_desc(); + TensorDesc outDesc = (xDesc.nDims > yDesc.nDims) ? xDesc : yDesc; + for (U32 i = 0; i < xDesc.nDims; i++) { + if (xDesc.dims[i] > outDesc.dims[i]) { + outDesc.dims[i] = xDesc.dims[i]; + } + } + for (U32 i = 0; i < yDesc.nDims; i++) { + if (yDesc.dims[i] > outDesc.dims[i]) { + outDesc.dims[i] = yDesc.dims[i]; } } - return true; + outputTensor->resize(outDesc); + return SUCCESS; } -int brocastIndex(TensorDesc inputDesc, TensorDesc conditionDesc) +inline static std::vector get_dims(const TensorDesc &desc) { - if (inputDesc.nDims != conditionDesc.nDims) { - return -1; + std::vector dims; + if (desc.df == DF_NCHWC8) { + dims.push_back(8); } - - for (int i = 2; i < (int)(inputDesc.nDims); i++) { - if (inputDesc.dims[i] != conditionDesc.dims[i]) { - return i; - } + for (U32 i = 0; i < desc.nDims; i++) { + dims.push_back(desc.dims[i]); } - return -1; + return dims; } template -static EE diffSourceWhere(TensorDesc inputDesc, - TensorDesc conditionDesc, - TensorDesc yDesc, - T *inputPtr, - U8 *conditionPtr, - T *yPtr, - T *outputPtr) +static void where_kernel(const TensorDesc &conditionDesc, + const U8 *condition, + const TensorDesc &xDesc, + const T *x, + const TensorDesc &yDesc, + const T *y, + const TensorDesc &outDesc, + T *out) { - if (tensorDescEqual(inputDesc, conditionDesc)) { - for (int i = 0; i < (int)(tensorNumElements(inputDesc)); i++) { - if (tensorNumElements(yDesc) == 1) { - outputPtr[i] = (conditionPtr[i] > 0) ? inputPtr[i] : yPtr[0]; - } else if (tensorNumElements(inputDesc) == tensorNumElements(yDesc)) { - outputPtr[i] = (conditionPtr[i] > 0) ? 
inputPtr[i] : yPtr[i]; - } else { - return NOT_SUPPORTED; - } + if (tensorNumElements(xDesc) == 1 && + tensorNumElements(conditionDesc) >= outDesc.dims[0] && + tensorNumElements(yDesc) == tensorNumElements(outDesc)) + { + UNI_MEMCPY(out, y, tensorNumBytes(yDesc)); + DataType odt; + DataFormat odf; + U32 on, oc, oh, ow; + if (tensorIs3d(outDesc)) { + CHECK_STATUS(tensor3dGet(outDesc, &odt, &odf, &on, &oc, &ow)); + oh = 1; + } else if (tensorIs4d(outDesc)) { + CHECK_STATUS(tensor4dGet(outDesc, &odt, &odf, &on, &oc, &oh, &ow)); + } else { + UNI_ERROR_LOG("where currently only support 3d/4d tensor.\n"); + return; } - } else { - int bIndex = brocastIndex(inputDesc, conditionDesc); - if (bIndex == -1) { - return NOT_SUPPORTED; + U8 c8 = 1; + if (odf == DF_NCHWC8) { + c8 = 8; } - int batchNum = 1; - for (int i = 0; i < bIndex; i++) { - batchNum *= inputDesc.dims[i]; - } - for (int i = 0; i < (int)(inputDesc.dims[bIndex]); i++) { - for (int j = 0; j < (int)(inputDesc.dims[1]); j++) { - for (int k = 0; k < (int)(inputDesc.dims[0]); k++) { - if (tensorNumElements(yDesc) == 1) { - outputPtr[i * batchNum + j * inputDesc.dims[0] + k] = - conditionPtr[j * conditionDesc.dims[0] + k] > 0 - ? inputPtr[i * batchNum + j * inputDesc.dims[0] + k] - : yPtr[0]; - } else if (tensorNumElements(inputDesc) == tensorNumElements(yDesc)) { - outputPtr[i * batchNum + j * inputDesc.dims[0] + k] = - conditionPtr[j * conditionDesc.dims[0] + k] > 0 - ? inputPtr[i * batchNum + j * inputDesc.dims[0] + k] - : yPtr[i * batchNum + j * inputDesc.dims[0] + k]; - } else { - return NOT_SUPPORTED; + oc /= c8; + for (U32 w = 0; w < ow; w++) { + if (condition[w]) { + for (U32 n = 0; n < on; n++) { + for (U32 c0 = 0; c0 < oc; c0++) { + for (U32 h = 0; h < oh; h++) { + for (U32 c1 = 0; c1 < c8; c1++) { + out[(((n * oc + c0) * oh + h) * ow + w) * c8 + c1] = x[0]; + } + } } } } } + return; + } + U32 length = tensorNumElements(outDesc); + if (xDesc.df != DF_NCHWC8 && yDesc.df != DF_NCHWC8) { + for (U32 i = 0; i < length; i++) { + const std::vector &id = calculateLocalIndex(i, outDesc.dims, outDesc.nDims); + int ci = calculateGlobalIndex(id.data(), conditionDesc.dims, conditionDesc.nDims); + int xi = calculateGlobalIndex(id.data(), xDesc.dims, xDesc.nDims); + int yi = calculateGlobalIndex(id.data(), yDesc.dims, yDesc.nDims); + out[i] = condition[ci] ? x[xi] : y[yi]; + } + return; + } + const std::vector &cdims = get_dims(conditionDesc); + const std::vector &xdims = get_dims(xDesc); + const std::vector &ydims = get_dims(yDesc); + const std::vector &odims = get_dims(outDesc); + std::vector id_c1(odims.size()), id_c8(odims.size() + 1); + U32 *cid = (conditionDesc.nDims == cdims.size()) ? id_c1.data() : id_c8.data(); + U32 *xid = (xDesc.nDims == xdims.size()) ? id_c1.data() : id_c8.data(); + U32 *yid = (yDesc.nDims == ydims.size()) ? 
id_c1.data() : id_c8.data(); + int axis = outDesc.nDims - 2; + for (U32 i = 0; i < length; i++) { + const std::vector &id = calculateLocalIndex(i, odims.data(), odims.size()); + if (outDesc.nDims != odims.size()) { + UNI_MEMCPY(id_c8.data(), id.data(), id.size() * sizeof(float)); + UNI_MEMCPY(id_c1.data(), id.data() + 1, (id.size() - 1) * sizeof(float)); + id_c1[axis] = id_c1[axis] * 8 + id[0]; + } else { + UNI_MEMCPY(id_c1.data(), id.data(), id.size() * sizeof(float)); + UNI_MEMCPY(id_c8.data() + 1, id.data(), id.size() * sizeof(float)); + id_c8[0] = id[axis] % 8; + id_c8[axis + 1] = id[axis] / 8; + } + int ci = calculateGlobalIndex(cid, cdims.data(), cdims.size()); + int xi = calculateGlobalIndex(xid, xdims.data(), xdims.size()); + int yi = calculateGlobalIndex(yid, ydims.data(), ydims.size()); + out[i] = condition[ci] ? x[xi] : y[yi]; } - return SUCCESS; } -// replaceF -> yTensor -EE where(Tensor inputTensor, - Tensor conditionTensor, - Tensor yTensor, - Tensor outputTensor, - ArchInfo_t archInfo) +EE where( + Tensor conditionTensor, Tensor xTensor, Tensor yTensor, Tensor outputTensor, ArchInfo_t archInfo) { auto arch = archInfo->arch; - void *input = get_ptr_from_tensor(inputTensor, arch); void *condition = get_ptr_from_tensor(conditionTensor, arch); - void *yPtr = get_ptr_from_tensor(yTensor, arch); - void *output = get_ptr_from_tensor(outputTensor, arch); - TensorDesc inputDesc = inputTensor.get_desc(); + void *x = get_ptr_from_tensor(xTensor, arch); + void *y = get_ptr_from_tensor(yTensor, arch); + void *out = get_ptr_from_tensor(outputTensor, arch); TensorDesc conditionDesc = conditionTensor.get_desc(); + TensorDesc xDesc = xTensor.get_desc(); TensorDesc yDesc = yTensor.get_desc(); - - if (inputDesc.dims[1] == 1) { - memcpy(output, input, tensorNumBytes(inputDesc)); - return SUCCESS; - } + TensorDesc outDesc = outputTensor.get_desc(); EE ret = SUCCESS; - switch (inputDesc.dt) { + switch (xDesc.dt) { #ifdef _USE_FP32 case DT_F32: { - ret = diffSourceWhere(inputDesc, conditionDesc, yDesc, (F32 *)input, (U8 *)condition, - (F32 *)yPtr, (F32 *)output); + where_kernel(conditionDesc, (const U8 *)condition, xDesc, (const F32 *)x, yDesc, + (const F32 *)y, outDesc, (F32 *)out); break; } #endif #ifdef _USE_FP16 case DT_F16: { - ret = diffSourceWhere(inputDesc, conditionDesc, yDesc, (F16 *)input, (U8 *)condition, - (F16 *)yPtr, (F16 *)output); + where_kernel(conditionDesc, (const U8 *)condition, xDesc, (const F16 *)x, yDesc, + (const F16 *)y, outDesc, (F16 *)out); break; } #endif diff --git a/compute/tensor/tests/test_activation.cpp b/compute/tensor/tests/test_activation.cpp index 02b7e443..5c2bb229 100644 --- a/compute/tensor/tests/test_activation.cpp +++ b/compute/tensor/tests/test_activation.cpp @@ -23,7 +23,7 @@ int activationFunctionTest(U32 in, const char *activationType) { DataFormat df = DF_NCHWC8; - memset(activationDesc.value, 0, sizeof(activationDesc.value)); + UNI_MEMSET(activationDesc.value, 0, sizeof(activationDesc.value)); TensorDesc dataDesc = tensor4df(dt, df, in, ic, ih, iw); U32 len = tensorNumElements(dataDesc); @@ -32,8 +32,8 @@ int activationFunctionTest(U32 in, Tensor dataTensor = Tensor::alloc_sized(dataDesc); Tensor dataTensorRef = Tensor::alloc_sized(dataDesc); - memcpy(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(dataDesc)); - memcpy(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(dataDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(dataDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(dataDesc)); if (UT_CHECK) { //check diff --git a/compute/tensor/tests/test_argmax.cpp b/compute/tensor/tests/test_argmax.cpp index cec11ded..05f2219f 100644 --- a/compute/tensor/tests/test_argmax.cpp +++ b/compute/tensor/tests/test_argmax.cpp @@ -30,7 +30,7 @@ int argmaxTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; Tensor outputTensorRef; diff --git a/compute/tensor/tests/test_attention.cpp b/compute/tensor/tests/test_attention.cpp index 0e3fe0e1..c1b0d0cb 100644 --- a/compute/tensor/tests/test_attention.cpp +++ b/compute/tensor/tests/test_attention.cpp @@ -56,7 +56,7 @@ int attentionTest(int argc, char **argv, DataType dt) } } - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); if (UT_CHECK) { CHECK_STATUS(attention(inputTensor, outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_axpby.cpp b/compute/tensor/tests/test_axpby.cpp index 02f9cfa9..23188c45 100644 --- a/compute/tensor/tests/test_axpby.cpp +++ b/compute/tensor/tests/test_axpby.cpp @@ -28,7 +28,7 @@ int axpbyTest(int argc, char **argv, DataType dt) U8 *y = ut_input_v(len, dt, UT_INIT_RANDOM); U8 *y_ref = ut_input_v(len, dt, UT_INIT_ZERO); - memcpy(y_ref, y, tensorNumBytes(yDesc)); + UNI_MEMCPY(y_ref, y, tensorNumBytes(yDesc)); // check if (UT_CHECK) { CHECK_STATUS(vector_vector_axpby(a, xDesc, x, b, yDesc, y, UT_CPU_ARCH)); diff --git a/compute/tensor/tests/test_check.cpp b/compute/tensor/tests/test_check.cpp index 2acb8965..e6c664ea 100644 --- a/compute/tensor/tests/test_check.cpp +++ b/compute/tensor/tests/test_check.cpp @@ -24,28 +24,23 @@ int checkTest(int argc, char **argv, DataType dt) DataFormat df = DF_NCHW; CheckParamSpec p; - p.check_mode = CHECK_EQUAL; + p.mode = CHECK_EQUAL; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); U8 *inputA = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); U8 *inputB = ut_input_v(tensorNumElements(inDesc), dt, UT_INIT_RANDOM); - Tensor inputTensorA; - Tensor inputTensorB; - inputTensorA.resize(inDesc); - inputTensorB.resize(inDesc); - inputTensorA.alloc(); - inputTensorB.alloc(); - memcpy(get_ptr_from_tensor(inputTensorA, CPU_GENERAL), inputA, tensorNumBytes(inDesc)); - memcpy(get_ptr_from_tensor(inputTensorB, CPU_GENERAL), inputB, tensorNumBytes(inDesc)); + Tensor inputTensorA = Tensor::alloc_sized(inDesc); + Tensor inputTensorB = Tensor::alloc_sized(inDesc); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorA, CPU_GENERAL), inputA, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorB, CPU_GENERAL), inputB, tensorNumBytes(inDesc)); Tensor outputTensor; - Tensor outputTensorRef; CHECK_STATUS( check_infer_output_size({&inputTensorA, &inputTensorB}, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); - outputTensorRef.resize(outputTensor.get_desc()); - outputTensorRef.alloc(); + TensorDesc outDesc = outputTensor.get_desc(); + Tensor outputTensorRef = Tensor::alloc_sized(outDesc); if (UT_CHECK) { CHECK_STATUS(check(inputTensorA, inputTensorB, p, outputTensor, &UT_CPU_ARCHINFO)); @@ -55,7 +50,7 @@ int checkTest(int argc, char **argv, DataType dt) // check 
ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), DT_I32, 0, + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), outDesc.dt, 0, __FILE__, __LINE__); } diff --git a/compute/tensor/tests/test_clip.cpp b/compute/tensor/tests/test_clip.cpp index ea447ea9..2baca31c 100644 --- a/compute/tensor/tests/test_clip.cpp +++ b/compute/tensor/tests/test_clip.cpp @@ -27,7 +27,7 @@ int clipTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; Tensor outputTensorRef; diff --git a/compute/tensor/tests/test_concat.cpp b/compute/tensor/tests/test_concat.cpp index 750308ec..433dbd59 100644 --- a/compute/tensor/tests/test_concat.cpp +++ b/compute/tensor/tests/test_concat.cpp @@ -60,7 +60,7 @@ int concatTest(int argc, char **argv, DataType dt) // setup tmp U32 tmpBytes; - CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, &tmpBytes, &UT_CPU_ARCHINFO)); + CHECK_STATUS(concat_infer_forward_tmp_bytes(inTensors, outTensor, &tmpBytes, &UT_CPU_ARCHINFO)); Tensor tmpTensor; tmpTensor.resize(tensor1d(DT_U8, tmpBytes)); tmpTensor.alloc(); @@ -85,7 +85,7 @@ int concatTest(int argc, char **argv, DataType dt) transformToNCHW(inputDesc, srcPtr, tmpDesc, tmpPtr); srcPtr = tmpPtr; } - memcpy(outputRef + count, srcPtr, bytes); + UNI_MEMCPY(outputRef + count, srcPtr, bytes); count += bytes; tmpPtr += bytes; } diff --git a/compute/tensor/tests/test_concat_int8.cpp b/compute/tensor/tests/test_concat_int8.cpp index 16f8088f..e9f11c13 100644 --- a/compute/tensor/tests/test_concat_int8.cpp +++ b/compute/tensor/tests/test_concat_int8.cpp @@ -75,7 +75,7 @@ int int8ConcatTest(int argc, char **argv, DataType dt) U8 *tmp = (U8 *)ut_input_v(in_len, dt, UT_INIT_ZERO); U8 *out_d = (U8 *)ut_input_v(in_len, dt, UT_INIT_ZERO); for (int i = 0, index = 0; i < num; i++) { - memcpy(tmp + index, get_ptr_from_tensor(inTensorsRef[i], CPU_GENERAL), + UNI_MEMCPY(tmp + index, get_ptr_from_tensor(inTensorsRef[i], CPU_GENERAL), inTensorsRef[i].bytes()); index += inTensorsRef[i].bytes(); } diff --git a/compute/tensor/tests/test_concat_ocl.cpp b/compute/tensor/tests/test_concat_ocl.cpp index 6ab06559..055957ce 100644 --- a/compute/tensor/tests/test_concat_ocl.cpp +++ b/compute/tensor/tests/test_concat_ocl.cpp @@ -85,7 +85,7 @@ int concatTest(int argc, char **argv, DataType dt) U32 maxBytes = 0; U32 tmpBytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, &tmpBytes, &archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes(inputTensor, outputTensor, &tmpBytes, &archInfo)); maxBytes = (tmpBytes > maxBytes) ? 
tmpBytes : maxBytes; GCLMem_t output = alloc(outputTensor); @@ -129,7 +129,7 @@ int concatTest(int argc, char **argv, DataType dt) for (int i = 0; i < num; i++) { inputTensorCpu[i].alloc(); inputDesc[i].df = DF_NCHW; - memcpy(get_ptr_from_tensor(inputTensorCpu[i], CPU_GENERAL), input_cpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu[i], CPU_GENERAL), input_cpu[i], tensorNumBytes(inputDesc[i])); } diff --git a/compute/tensor/tests/test_convolution.cpp b/compute/tensor/tests/test_convolution.cpp index c8395c66..5877bdec 100644 --- a/compute/tensor/tests/test_convolution.cpp +++ b/compute/tensor/tests/test_convolution.cpp @@ -51,7 +51,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) TensorDesc filterDesc = tensor4df(dt, DF_NCHW, fn, fc, fh, fw); TensorDesc biasDesc = tensor1d(dt, oc); ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE_POINTWISE); // setup input, filter, bias U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); @@ -76,12 +76,15 @@ int convolutionTest(int argc, char *argv[], DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); + UNI_MEMCPY( get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, bytesOf(dt) * fn * fc * fh * fw); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, bytesOf(dt) * oc); // setup output, bias CHECK_STATUS(convolution_infer_output_size( diff --git a/compute/tensor/tests/test_convolution_bnn.cpp b/compute/tensor/tests/test_convolution_bnn.cpp index 765ad294..0a44058c 100644 --- a/compute/tensor/tests/test_convolution_bnn.cpp +++ b/compute/tensor/tests/test_convolution_bnn.cpp @@ -14,7 +14,7 @@ #include "tensor_computing.h" #include "ut_util.h" -int bnnConvolutionTest(int argc, char *argv[], DataType dt) +int bnnConvolutionTest(int argc, char *argv[], DataType idt, DataType fdt) { CHECK_REQUIREMENT(argc == 16); // in data @@ -39,21 +39,20 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) CHECK_REQUIREMENT(in == 1 && on == 1); - DataType fdt = DT_BIN11; // Use dt to distinguish DoReFa and XNOR ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; - TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); + TensorDesc inputDesc = tensor4df(idt, DF_NCHWC8, in, ic, ih, iw); TensorDesc filterDesc = tensor4df(fdt, DF_NCHW, oc, ic, fh, fw); - TensorDesc biasDesc = tensor1d(dt, oc * 2); // including scale and bias + TensorDesc biasDesc = tensor1d(idt, oc * 2); // including scale and bias ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, oc, 
Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, oc, CONVOLUTION_DEPTHWISE_POINTWISE); // setup input, filter, bias - U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); + U8 *input = ut_input_v(in * ic * ih * iw, idt, UT_INIT_RANDOM); if (fdt == DT_BIN01) { for (U32 i = 0; i < in * ic * ih * iw; i++) { - switch (dt) { + switch (idt) { #ifdef _USE_FP16 case DT_F16: ((F16 *)input)[i] += 0.5; @@ -71,7 +70,7 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) } BIN8 *filter = (BIN8 *)ut_input_v(fn * fc * fh * fw / 8, fdt, UT_INIT_POS); - U8 *bias = ut_input_v(oc * 2, dt, UT_INIT_RANDOM); + U8 *bias = ut_input_v(oc * 2, idt, UT_INIT_RANDOM); Tensor inputTensor; Tensor inputTensorRef; Tensor filterTensor; @@ -91,15 +90,18 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(idt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(idt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(convolution_infer_output_size( - &inputTensor, filterTensor, p, &outputTensor, dt, &UT_CPU_ARCHINFO)); + &inputTensor, filterTensor, p, &outputTensor, idt, &UT_CPU_ARCHINFO)); outputTensor.alloc(); outputTensorRef.resize(outputTensor.get_desc()); @@ -141,7 +143,7 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) tmpTensors, outputTensorRef, activationDesc, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), dt, 1, + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), idt, 1, __FILE__, __LINE__); } @@ -172,7 +174,8 @@ int bnnConvolutionTest(int argc, char *argv[], DataType dt) int main(int argc, char **argv) { #ifdef _USE_FP16 - bnnConvolutionTest(argc, argv, DT_F16); + bnnConvolutionTest(argc, argv, DT_F16, DT_BIN01); + bnnConvolutionTest(argc, argv, DT_F16, DT_BIN11); #endif return 0; } diff --git a/compute/tensor/tests/test_convolution_int8.cpp b/compute/tensor/tests/test_convolution_int8.cpp index c39f1fd7..8f569f04 100644 --- a/compute/tensor/tests/test_convolution_int8.cpp +++ b/compute/tensor/tests/test_convolution_int8.cpp @@ -45,7 +45,7 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData TensorDesc inputDesc, filterDesc, outputDesc, biasDesc; ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise_Pointwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE_POINTWISE); if (ic % 
8 != 0) { printf("[WARN] can not quantize the first layer\n"); @@ -120,10 +120,8 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData CHECK_STATUS(convolution_transform_filter( filterTensor, p, alg, tmpTensor, &tFilter, &UT_CPU_ARCHINFO)); - TensorDesc ftmDesc = tFilter.get_desc(); - ftmDesc.dt = DT_I8; - ftmTensor = Tensor::alloc_sized(ftmDesc); - + U32 ftmBytes = ftBytes / bytesOf(filterDataType); + ftmTensor = Tensor::alloc_sized(tensor1d(DT_U8, ftmBytes)); scales = std::vector(38); CHECK_STATUS(quantize(tFilter, &ftmTensor, scales.data() + 2, &UT_CPU_ARCHINFO)); break; @@ -159,7 +157,7 @@ int int8ConvolutionTest(int argc, char *argv[], DataType dt, DataType filterData // TensorDesc inputC16Desc = inputDesc; // inputC16Desc.df = DF_NCHWC16; // transformToNCHWC16(inputDesc, (void *)get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16Desc, inputC16); - // memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16, tensorNumBytes(inputDesc)); + // UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), inputC16, tensorNumBytes(inputDesc)); // inputTensor.resize(inputC16Desc); // free(inputC16); // #endif diff --git a/compute/tensor/tests/test_convolution_ocl.cpp b/compute/tensor/tests/test_convolution_ocl.cpp index 080049fa..e8479769 100644 --- a/compute/tensor/tests/test_convolution_ocl.cpp +++ b/compute/tensor/tests/test_convolution_ocl.cpp @@ -96,7 +96,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) dilationH = atoi(argv[14]); dilationW = atoi(argv[15]); if (argc == 17) { - use_nchw = atoi(argv[6]); + use_nchw = atoi(argv[16]); } } @@ -136,7 +136,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, ft, fh, fw, strideT, strideH, strideW, paddingTF, paddingTB, paddingT, paddingB, paddingL, paddingR, 1, - dilationH, dilationW, fn, Convolution_Depthwise_Pointwise); + dilationH, dilationW, fn, CONVOLUTION_DEPTHWISE_POINTWISE); TensorDesc inputDesc, filterDesc, inputDesc_gpu; if (it > 1) { @@ -220,7 +220,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) U32 ocAlign = (oc + 3) / 4 * 4; if (ocAlign != oc) { U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -245,7 +245,7 @@ int convolutionTest(int argc, char *argv[], DataType dt) tmp[0] = tmpTensorImgA; } alloc_img(tmpTensorImgB, maxBytes + 4); - Tensor filterTensorTran = filterTensor; + Tensor filterTensorTran = filterTensor; if (alg == CONVOLUTION_ALGORITHM_WINOGRAD && archInfo.arch == QUALCOMM) { tmp[0] = tmpTensor; @@ -260,8 +260,8 @@ int convolutionTest(int argc, char *argv[], DataType dt) CHECK_STATUS(ocl_set_input(handle, input, inputDesc_gpu, input_cpu, tmpbuf, true)); std::vector inputTensors(1, inputTensor); - CHECK_STATUS(convolution(inputTensors, filterTensorTran, convParamSpec, alg, nullptr, biasTensor, - tmp, outputTensor, activationDesc, &archInfo)); + CHECK_STATUS(convolution(inputTensors, filterTensorTran, convParamSpec, alg, nullptr, + biasTensor, tmp, outputTensor, activationDesc, &archInfo)); /*warp up*/ for (U32 i = 0; i < 2; i++) { @@ -314,18 +314,19 @@ int convolutionTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, 
tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputDesc.df = DF_NCHW; diff --git a/compute/tensor/tests/test_deconvolution.cpp b/compute/tensor/tests/test_deconvolution.cpp index adcd9104..154f40c1 100644 --- a/compute/tensor/tests/test_deconvolution.cpp +++ b/compute/tensor/tests/test_deconvolution.cpp @@ -36,13 +36,12 @@ int deconvolutionTest(int argc, char **argv, DataType dt) U32 oc = atoi(argv[13]); U32 oh = atoi(argv[14]); U32 ow = atoi(argv[15]); - CHECK_REQUIREMENT(in == 1 && on == 1); CHECK_REQUIREMENT(ic % 8 == 0 && oc % 8 == 0); ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Deconvolution); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DECONVOLUTION); TensorDesc outputDesc; TensorDesc inputDesc = tensor4df(dt, DF_NCHWC8, in, ic, ih, iw); @@ -73,11 +72,14 @@ int deconvolutionTest(int argc, char **argv, DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(deconvolution_infer_output_size( diff --git a/compute/tensor/tests/test_deconvolution_ocl.cpp b/compute/tensor/tests/test_deconvolution_ocl.cpp index 86b81227..701664c9 100644 --- a/compute/tensor/tests/test_deconvolution_ocl.cpp +++ b/compute/tensor/tests/test_deconvolution_ocl.cpp @@ -53,7 +53,7 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) ActivationParamSpec activationDesc; activationDesc.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Deconvolution); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DECONVOLUTION); TensorDesc inputDesc = tensor4df(dt, DF_NCHW, in, ic, ih, iw); TensorDesc filterDesc = tensor4df(dt, 
DF_NCHW, fn, fc, fh, fw); @@ -116,7 +116,7 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) if ((oc & 3) != 0) { U32 ocAlign = (oc + 3) / 4 * 4; U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -161,18 +161,19 @@ int deconvolutionTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc); diff --git a/compute/tensor/tests/test_depthwise_convolution.cpp b/compute/tensor/tests/test_depthwise_convolution.cpp index 3679b03d..b8f4035a 100644 --- a/compute/tensor/tests/test_depthwise_convolution.cpp +++ b/compute/tensor/tests/test_depthwise_convolution.cpp @@ -52,7 +52,7 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp pwBiasDesc = tensor1d(dt, oc); } ConvolutionParamSpec p = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, - padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise); + padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE); // setup input, filter, bias U8 *dwFilter = nullptr; @@ -82,13 +82,15 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp dwFilterTensor.alloc(); dwFilterTensorRef.alloc(); dwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( get_ptr_from_tensor(dwFilterTensor, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); + UNI_MEMCPY(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); Tensor pwFilterTensor; Tensor pwFilterTensorRef; Tensor pwBiasTensor; @@ -101,11 +103,11 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp pwFilterTensor.alloc(); pwFilterTensorRef.alloc(); pwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, + 
UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); } // setup output, bias @@ -183,21 +185,21 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp if (UT_CHECK) { if (isFusedWithPw) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, p, - alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, - pwActivationParamSpec, &UT_CPU_ARCHINFO)); + alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsRef, dwFilterTensorRef, - pwFilterTensorRef, p, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensorRef, - dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + pwFilterTensorRef, p, alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, + outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); } else { - CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, - tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement - CHECK_STATUS( - depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, dwBiasTensor, - tmpTensor, outputTensorRef, dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensorRef, dwFilterTensorRef, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensorRef, dwActivationParamSpec, + &UT_SERIAL_ARCHINFO)); } // check @@ -211,11 +213,11 @@ int depthwiseConvolutionTest(int argc, char *argv[], bool isFusedWithPw, DataTyp for (int iter = 0; iter < UT_LOOPS; iter++) { if (isFusedWithPw) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, p, - alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, - pwActivationParamSpec, &UT_CPU_ARCHINFO)); + alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); } else { - CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, dwBiasTensor, - tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); + CHECK_STATUS(depthwise_convolution(inputTensor, dwFtmTensor, p, alg, nullptr, + dwBiasTensor, tmpTensor, outputTensor, dwActivationParamSpec, &UT_CPU_ARCHINFO)); } } double time_end = ut_time_ms(); diff --git a/compute/tensor/tests/test_depthwise_convolution_int8.cpp b/compute/tensor/tests/test_depthwise_convolution_int8.cpp index df11b4cf..52f5d09f 100644 --- a/compute/tensor/tests/test_depthwise_convolution_int8.cpp +++ b/compute/tensor/tests/test_depthwise_convolution_int8.cpp @@ -53,7 +53,7 @@ int main(int argc, char *argv[]) dwBiasDesc = tensor1d(odt, ic); pwBiasDesc = tensor1d(odt, oc); ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, Convolution_Depthwise); + stride, 0, 0, padding, padding, padding, padding, 1, 1, 1, fn, CONVOLUTION_DEPTHWISE); // setup input, filter, bias INT8 *input = (INT8 
*)ut_input_v(in * ic * ih * iw, DT_I8, UT_INIT_RANDOM); @@ -81,13 +81,15 @@ int main(int argc, char *argv[]) dwFilterTensor.alloc(); dwFilterTensorRef.alloc(); dwBiasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy( + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( get_ptr_from_tensor(dwFilterTensor, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorRef, CPU_GENERAL), dwFilter, bytesOf(dt) * 1 * ic * fh * fw); - memcpy(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); + UNI_MEMCPY(get_ptr_from_tensor(dwBiasTensor, CPU_GENERAL), dwBias, bytesOf(dt) * ic); Tensor pwFilterTensor; Tensor pwFilterTensorRef; @@ -98,11 +100,11 @@ int main(int argc, char *argv[]) pwFilterTensor.alloc(); pwFilterTensorRef.alloc(); pwBiasTensor.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(pwFilterTensor, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorRef, CPU_GENERAL), pwFilter, bytesOf(dt) * oc * ic * 1 * 1); - memcpy(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); + UNI_MEMCPY(get_ptr_from_tensor(pwBiasTensor, CPU_GENERAL), pwBias, bytesOf(dt) * oc); // setup output, bias CHECK_STATUS(depthwise_pointwise_convolution_infer_output_size(&inputTensor, dwFilterTensor, @@ -143,15 +145,16 @@ int main(int argc, char *argv[]) std::vector<Tensor> inputTensors(1, inputTensor); std::vector<Tensor> inputTensorsRef(1, inputTensorRef); std::vector<Tensor> tmpTensors(1, tmpTensor); + F32 scales[3] = {1, 1, 1}; if (UT_CHECK) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); // naive implement CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsRef, dwFilterTensorRef, - pwFilterTensorRef, convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, + pwFilterTensorRef, convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensorRef, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); // check @@ -164,7 +167,7 @@ int main(int argc, char *argv[]) double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFtmTensor, pwFtmTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, scales, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &UT_CPU_ARCHINFO)); } double time_end = ut_time_ms(); diff --git a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp index bc977906..6cc96821 100644 --- a/compute/tensor/tests/test_depthwise_convolution_ocl.cpp +++ b/compute/tensor/tests/test_depthwise_convolution_ocl.cpp @@ -82,7 +82,7 @@ int depthwiseConvolutionTest(int argc, char *argv[],
DataFormat filterDataFormat dwActivationParamSpec.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, padding, padding, padding, padding, dila, dila, dila, fn, - Convolution_Depthwise); + CONVOLUTION_DEPTHWISE); U32 filterLen = fn * fc * fh * fw; U32 biasLen = oc; @@ -149,7 +149,7 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat if ((oc & 3) != 0) { U32 ocAlign = (oc + 3) / 4 * 4; U8 *bias_cpu_align = ut_input_v(ocAlign, dt, UT_INIT_ZERO); - memcpy(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(bias_cpu_align, bias_cpu, oc * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_cpu_align; } @@ -172,8 +172,8 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat CHECK_STATUS(ocl_set_input(handle, input, inputDesc, input_cpu, tmpbuf, true)); - CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, biasTensor, - tmp, outputTensor, dwActivationParamSpec, &archInfo)); + CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, convParamSpec, alg, nullptr, + biasTensor, tmp, outputTensor, dwActivationParamSpec, &archInfo)); /*warp up*/ for (U32 i = 0; i < 2; i++) { @@ -201,18 +201,19 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat outputDesc.df = DF_NCHW; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc); @@ -226,8 +227,8 @@ int depthwiseConvolutionTest(int argc, char *argv[], DataFormat filterDataFormat tmpTensorCpu.alloc(); CHECK_STATUS(depthwise_convolution(inputTensorCpu, filterTensorCpu, convParamSpec, - DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, biasTensorCpu, tmpTensorCpu, outputTensorCpu, - dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); + DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, nullptr, biasTensorCpu, tmpTensorCpu, + outputTensorCpu, dwActivationParamSpec, &UT_SERIAL_ARCHINFO)); ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), on * oc * ow * oh, dt); CHECK_STATUS(gcl_finish(handle)); diff --git a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp index 9b48a4fd..a5ba7d98 100644 --- a/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp +++ b/compute/tensor/tests/test_depthwise_pointwise_convolution_ocl.cpp @@ -88,7 +88,7 @@ int depthwisePointwiseConvolutionTest( pwActivationParamSpec.mode = ACTIVATION_NULL; ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, stride, 0, 0, pt, pb, pl, pr, dilation, dilation, dilation, fn, - Convolution_Depthwise_Pointwise); + CONVOLUTION_DEPTHWISE_POINTWISE); U32 dwFilterLen = 1 * fc * fh * fw; U32 pwFilterLen = fn * fc * 1 * 1; @@ -181,14 +181,14 @@ int 
depthwisePointwiseConvolutionTest( if ((ic & 3) != 0) { U32 icAlign = (ic + 3) / 4 * 4; U8 *tmp = ut_input_v(icAlign, dt, UT_INIT_ZERO); - memcpy(tmp, dw_bias_cpu, ic * bytesOf(dt)); + UNI_MEMCPY(tmp, dw_bias_cpu, ic * bytesOf(dt)); free(dw_bias_cpu); dw_bias_cpu = tmp; } alloc_host_ptr(dwBiasTensor, dw_bias_cpu); U8 *pw_bias_val = ut_input_v(oc + 8, dt, UT_INIT_ZERO); - memcpy(pw_bias_val, pw_bias_cpu, oc * bytesOf(dt)); + UNI_MEMCPY(pw_bias_val, pw_bias_cpu, oc * bytesOf(dt)); free(pw_bias_cpu); pw_bias_cpu = pw_bias_val; alloc_host_ptr(pwBiasTensorImg, pw_bias_cpu); @@ -216,7 +216,7 @@ int depthwisePointwiseConvolutionTest( std::vector<Tensor> inputTensors(1, inputTensor); CHECK_STATUS(depthwise_pointwise_convolution(inputTensors, dwFilterTensor, pwFilterTensor, - convParamSpec, alg, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, + convParamSpec, alg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, dwActivationParamSpec, pwActivationParamSpec, &archInfo)); /*warp up*/ UNI_INFO_LOG("warm up gpu:\n") @@ -246,30 +246,31 @@ int depthwisePointwiseConvolutionTest( Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor dwFilterTensorCpu; dwFilterTensorCpu.resize(dwFilterDesc); dwFilterTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(dwFilterTensorCpu, CPU_GENERAL), dw_filter_cpu, + UNI_MEMCPY(get_ptr_from_tensor(dwFilterTensorCpu, CPU_GENERAL), dw_filter_cpu, tensorNumBytes(dwFilterDesc)); Tensor pwFilterTensorCpu; pwFilterTensorCpu.resize(pwFilterDesc); pwFilterTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(pwFilterTensorCpu, CPU_GENERAL), pw_filter_cpu, + UNI_MEMCPY(get_ptr_from_tensor(pwFilterTensorCpu, CPU_GENERAL), pw_filter_cpu, tensorNumBytes(pwFilterDesc)); Tensor dwBiasTensorCpu; dwBiasTensorCpu.resize(dwBiasDesc); dwBiasTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(dwBiasTensorCpu, CPU_GENERAL), dw_bias_cpu, tensorNumBytes(dwBiasDesc)); Tensor pwBiasTensorCpu; pwBiasTensorCpu.resize(pwBiasDesc); pwBiasTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(pwBiasTensorCpu, CPU_GENERAL), pw_bias_cpu, tensorNumBytes(pwBiasDesc)); Tensor outputTensorCpu; @@ -287,8 +288,8 @@ int depthwisePointwiseConvolutionTest( std::vector<Tensor> inputTensorsCpu(1, inputTensorCpu); std::vector<Tensor> tmpTensorsCpu(1, tmpTensorCpu); CHECK_STATUS(depthwise_pointwise_convolution(inputTensorsCpu, dwFilterTensorCpu, - pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, dwBiasTensorCpu, - pwBiasTensorCpu, tmpTensorsCpu, outputTensorCpu, dwActivationParamSpec, + pwFilterTensorCpu, convParamSpec, DEPTHWISE_CONVOLUTION_ALGORITHM_DIRECT, nullptr, + dwBiasTensorCpu, pwBiasTensorCpu, tmpTensorsCpu, outputTensorCpu, dwActivationParamSpec, pwActivationParamSpec, &UT_SERIAL_ARCHINFO)); ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), on * oc * ow * oh, dt); diff --git a/compute/tensor/tests/test_detectionoutput.cpp b/compute/tensor/tests/test_detectionoutput.cpp index 3f0e1894..af4ed5c6 100644 --- a/compute/tensor/tests/test_detectionoutput.cpp +++ b/compute/tensor/tests/test_detectionoutput.cpp @@ -57,11 +57,11 @@ int detectionoutputTest(int argc, char **argv, DataType dt) U8 *input_loc = ut_input_v(input_len_loc, dt, UT_INIT_RANDOM); U8 *input_conf = ut_input_v(input_len_conf, dt, UT_INIT_RANDOM); U8
*input_priorbox = ut_input_v(input_len_priorbox, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor_loc, CPU_GENERAL), input_loc, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_loc, CPU_GENERAL), input_loc, tensorNumBytes(inputDesc_loc)); - memcpy(get_ptr_from_tensor(inputTensor_conf, CPU_GENERAL), input_conf, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_conf, CPU_GENERAL), input_conf, tensorNumBytes(inputDesc_conf)); - memcpy(get_ptr_from_tensor(inputTensor_priorbox, CPU_GENERAL), input_priorbox, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_priorbox, CPU_GENERAL), input_priorbox, tensorNumBytes(inputDesc_priorbox)); inputTensors[0] = inputTensor_loc; inputTensors[1] = inputTensor_conf; diff --git a/compute/tensor/tests/test_dilated_convolution.cpp b/compute/tensor/tests/test_dilated_convolution.cpp index 1dc29edb..a9b76ef9 100644 --- a/compute/tensor/tests/test_dilated_convolution.cpp +++ b/compute/tensor/tests/test_dilated_convolution.cpp @@ -50,7 +50,7 @@ int dilatedConvolutionTest(int argc, char **argv, DataType dt) TensorDesc filterDesc = tensor4df(dt, DF_NCHW, oc, ic, fh, fw); TensorDesc biasDesc = tensor1d(dt, oc); ConvolutionParamSpec convParamSpec = createConvolutionParamSpec(group, 1, fh, fw, 1, stride, - stride, 0, 0, padding, padding, padding, padding, 1, rate, rate, fn, Convolution_Dilation); + stride, 0, 0, padding, padding, padding, padding, 1, rate, rate, fn, CONVOLUTION_DILATION); // setup input, filter, bias U8 *input = ut_input_v(in * ic * ih * iw, dt, UT_INIT_RANDOM); @@ -76,11 +76,14 @@ int dilatedConvolutionTest(int argc, char **argv, DataType dt) filterTensor.alloc(); filterTensorRef.alloc(); biasTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorRef, CPU_GENERAL), input, bytesOf(dt) * in * ic * ih * iw); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // setup output, bias CHECK_STATUS(convolution_infer_output_size( diff --git a/compute/tensor/tests/test_eltwise.cpp b/compute/tensor/tests/test_eltwise.cpp index a44af17f..b697aff9 100644 --- a/compute/tensor/tests/test_eltwise.cpp +++ b/compute/tensor/tests/test_eltwise.cpp @@ -28,7 +28,7 @@ int eltwiseTest(int argc, char **argv, DataType dt) U32 len = in * ic * ih * iw; EltwiseMode eltwiseMode = ELTWISE_MAX; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = eltwiseMode; + eltwiseDesc.mode = eltwiseMode; eltwiseDesc.activation_type = ACTIVATION_NULL; std::vector input(num); @@ -40,7 +40,7 @@ int eltwiseTest(int argc, char **argv, DataType dt) input[i] = (void *)ut_input_v(len, dt, UT_INIT_RANDOM); inTensors[i].resize(inDesc); inTensors[i].alloc(); - memcpy(get_ptr_from_tensor(inTensors[i], CPU_GENERAL), input[i], tensorNumBytes(inDesc)); + 
UNI_MEMCPY(get_ptr_from_tensor(inTensors[i], CPU_GENERAL), input[i], tensorNumBytes(inDesc)); inTensorPtr[i] = &inTensors[i]; } diff --git a/compute/tensor/tests/test_eltwise_ocl.cpp b/compute/tensor/tests/test_eltwise_ocl.cpp index 75a709e9..033a56ce 100644 --- a/compute/tensor/tests/test_eltwise_ocl.cpp +++ b/compute/tensor/tests/test_eltwise_ocl.cpp @@ -53,7 +53,7 @@ int eltwiseTest(int argc, char *argv[], DataType dt) EltwiseMode eltwiseMode = ELTWISE_SUM; EltwiseParamSpec eltwiseDesc; - eltwiseDesc.elt_mode = eltwiseMode; + eltwiseDesc.mode = eltwiseMode; eltwiseDesc.activation_type = ACTIVATION_NULL; std::vector inputCpu(num); @@ -71,7 +71,7 @@ int eltwiseTest(int argc, char *argv[], DataType dt) inTensorsCpu[i].resize(inDesc); } inTensorsCpu[i].alloc(); - memcpy(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], tensorNumBytes(inTensorsCpu[i].get_desc())); inTensorPtrCpu[i] = &inTensorsCpu[i]; } diff --git a/compute/tensor/tests/test_expand.cpp b/compute/tensor/tests/test_expand.cpp index 2aedf53e..b8402af1 100644 --- a/compute/tensor/tests/test_expand.cpp +++ b/compute/tensor/tests/test_expand.cpp @@ -28,11 +28,11 @@ int expandTest(int argc, char **argv, DataType dt) U32 oh = atoi(argv[7]); U32 ow = atoi(argv[8]); ExpandParamSpec p; - p.shape_size = 4; - p.shape_dims[0] = on; - p.shape_dims[1] = oc; - p.shape_dims[2] = oh; - p.shape_dims[3] = ow; + p.num_shape = 4; + p.shape[0] = on; + p.shape[1] = oc; + p.shape[2] = oh; + p.shape[3] = ow; DataFormat df = DF_NCHW; TensorDesc inDesc = tensor4df(dt, df, in, ic, ih, iw); @@ -41,7 +41,7 @@ int expandTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(expand_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_fully_connected.cpp b/compute/tensor/tests/test_fully_connected.cpp index b6b1a847..a54b401e 100644 --- a/compute/tensor/tests/test_fully_connected.cpp +++ b/compute/tensor/tests/test_fully_connected.cpp @@ -29,15 +29,15 @@ int fullyConnectedTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U8 *input = ut_input_v(m * k, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); Tensor filterTensor = Tensor::alloc_sized(filterDesc); U8 *filter = ut_input_v(k * n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); Tensor biasTensor = Tensor::alloc_sized(biasDesc); U8 *bias = ut_input_v(n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // set output Tensor outputTensor; CHECK_STATUS(fully_connected_infer_output_size( diff --git a/compute/tensor/tests/test_fully_connected_int8.cpp b/compute/tensor/tests/test_fully_connected_int8.cpp index c6133896..2709053a 100644 --- a/compute/tensor/tests/test_fully_connected_int8.cpp +++ 
b/compute/tensor/tests/test_fully_connected_int8.cpp @@ -32,23 +32,24 @@ int fullyConnectedTest(int argc, char **argv, DataType dt, DataType filterDataTy inputTensor.resize(inputDesc); inputTensor.alloc(); U8 *input = ut_input_v(m * k, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); filterTensor.resize(filterDesc); filterTensor.alloc(); U8 *filter = ut_input_v(k * n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); + UNI_MEMCPY(get_ptr_from_tensor(filterTensor, CPU_GENERAL), filter, tensorNumBytes(filterDesc)); if (m == 1) { filterDescRef.df = DF_NORMAL; } filterTensorRef.resize(filterDescRef); filterTensorRef.alloc(); - memcpy(get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDescRef)); + UNI_MEMCPY( + get_ptr_from_tensor(filterTensorRef, CPU_GENERAL), filter, tensorNumBytes(filterDescRef)); biasTensor.resize(biasDesc); biasTensor.alloc(); U8 *bias = ut_input_v(n, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensor, CPU_GENERAL), bias, tensorNumBytes(biasDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_fully_connected_ocl.cpp b/compute/tensor/tests/test_fully_connected_ocl.cpp index 268fed5c..5be67b1b 100644 --- a/compute/tensor/tests/test_fully_connected_ocl.cpp +++ b/compute/tensor/tests/test_fully_connected_ocl.cpp @@ -111,7 +111,7 @@ int fullyConnectedTest(int argc, char *argv[], DataType dt) biasNum = (fn + item_m - 1) / item_m * item_m; if (biasNum > fn) { U8 *bias_val = ut_input_v(biasNum, dt, UT_INIT_ZERO); - memcpy(bias_val, bias_cpu, fn * bytesOf(dt)); + UNI_MEMCPY(bias_val, bias_cpu, fn * bytesOf(dt)); free(bias_cpu); bias_cpu = bias_val; } @@ -169,18 +169,19 @@ int fullyConnectedTest(int argc, char *argv[], DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor filterTensorCpu; filterTensorCpu.resize(filterDesc); filterTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu, CPU_GENERAL), filter_cpu, tensorNumBytes(filterDesc)); Tensor biasTensorCpu; biasTensorCpu.resize(biasDesc); biasTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu, CPU_GENERAL), bias_cpu, tensorNumBytes(biasDesc)); Tensor outputTensorCpu; outputTensorCpu.resize(outputDesc_cpu); diff --git a/compute/tensor/tests/test_gather_ocl.cpp b/compute/tensor/tests/test_gather_ocl.cpp index 41e08d93..684ac396 100644 --- a/compute/tensor/tests/test_gather_ocl.cpp +++ b/compute/tensor/tests/test_gather_ocl.cpp @@ -75,10 +75,12 @@ int gatherTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, indexTensorCpu, outputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); indexTensorCpu.resize(indexDesc); indexTensorCpu.alloc(); 
- memcpy(get_ptr_from_tensor(indexTensorCpu, CPU_GENERAL), indexCpu, tensorNumBytes(indexDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(indexTensorCpu, CPU_GENERAL), indexCpu, tensorNumBytes(indexDesc)); CHECK_STATUS(gather_infer_output_size( &inputTensorCpu, &indexTensorCpu, p, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); diff --git a/compute/tensor/tests/test_l2normalization.cpp b/compute/tensor/tests/test_l2normalization.cpp index d4cb317b..61f540d0 100644 --- a/compute/tensor/tests/test_l2normalization.cpp +++ b/compute/tensor/tests/test_l2normalization.cpp @@ -29,7 +29,7 @@ int l2normalizationTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inputDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_matmul_int8.cpp b/compute/tensor/tests/test_matmul_int8.cpp index dabb4ac4..1cdaa6a8 100644 --- a/compute/tensor/tests/test_matmul_int8.cpp +++ b/compute/tensor/tests/test_matmul_int8.cpp @@ -55,10 +55,12 @@ int MatmulTest(int argc, char **argv, DataType dt, DataType filterDataType) } matrixBTensor.set_scale(1); matrixBTensorRef.set_scale(1); - memcpy(get_ptr_from_tensor(matrixATensor, CPU_GENERAL), A, tensorNumBytes(matrixADesc)); - memcpy(get_ptr_from_tensor(matrixATensorRef, CPU_GENERAL), ARef, tensorNumBytes(matrixADescRef)); - memcpy(get_ptr_from_tensor(matrixBTensor, CPU_GENERAL), B, tensorNumBytes(matrixBDesc)); - memcpy(get_ptr_from_tensor(matrixBTensorRef, CPU_GENERAL), BRef, tensorNumBytes(matrixBDescRef)); + UNI_MEMCPY(get_ptr_from_tensor(matrixATensor, CPU_GENERAL), A, tensorNumBytes(matrixADesc)); + UNI_MEMCPY( + get_ptr_from_tensor(matrixATensorRef, CPU_GENERAL), ARef, tensorNumBytes(matrixADescRef)); + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensor, CPU_GENERAL), B, tensorNumBytes(matrixBDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(matrixBTensorRef, CPU_GENERAL), BRef, tensorNumBytes(matrixBDescRef)); bool transposeA = (matrixADesc.df == DF_TRANSPOSE); bool transposeB = (matrixBDesc.df == DF_TRANSPOSE); diff --git a/compute/tensor/tests/test_matmul_ocl.cpp b/compute/tensor/tests/test_matmul_ocl.cpp index d71d96af..4cd759bc 100644 --- a/compute/tensor/tests/test_matmul_ocl.cpp +++ b/compute/tensor/tests/test_matmul_ocl.cpp @@ -194,13 +194,13 @@ int matmulTest(int argc, char *argv[], DataType dt) Tensor matrixATensorCpu; matrixATensorCpu.resize(matrixADesc); matrixATensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, tensorNumBytes(matrixADesc)); Tensor matrixBTensorCpu; matrixBTensorCpu.resize(matrixBDesc); matrixBTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, tensorNumBytes(matrixBDesc)); Tensor matrixCTensorCpu; diff --git a/compute/tensor/tests/test_matmul_ocl_f32.cpp b/compute/tensor/tests/test_matmul_ocl_f32.cpp index e34a0593..57ca760b 100644 --- a/compute/tensor/tests/test_matmul_ocl_f32.cpp +++ b/compute/tensor/tests/test_matmul_ocl_f32.cpp @@ -32,13 +32,13 @@ inline U8 *matmulF32Cpu(TensorDesc matrixADesc, Tensor matrixATensorCpu; matrixATensorCpu.resize(matrixADesc); matrixATensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixATensorCpu, 
CPU_GENERAL), matrixA_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixATensorCpu, CPU_GENERAL), matrixA_cpu, tensorNumBytes(matrixADesc)); Tensor matrixBTensorCpu; matrixBTensorCpu.resize(matrixBDesc); matrixBTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, + UNI_MEMCPY(get_ptr_from_tensor(matrixBTensorCpu, CPU_GENERAL), matrixB_cpu, tensorNumBytes(matrixBDesc)); CHECK_STATUS(matmul_infer_output_size(&matrixATensorCpu, transposeA, &matrixBTensorCpu, diff --git a/compute/tensor/tests/test_non_max_suppression.cpp b/compute/tensor/tests/test_non_max_suppression.cpp index 0a3a5353..b4147d05 100644 --- a/compute/tensor/tests/test_non_max_suppression.cpp +++ b/compute/tensor/tests/test_non_max_suppression.cpp @@ -25,9 +25,6 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) U32 in1 = atoi(argv[4]); U32 ic1 = atoi(argv[5]); U32 ilens1 = atoi(argv[6]); - // output - U32 oh = atoi(argv[7]); - U32 ow = atoi(argv[8]); // nonMaxSuppressionParamSpec U32 max_output_boxes_per_class = atoi(argv[9]); F32 iou_threshold = (F32)atof(argv[10]); @@ -45,11 +42,11 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensors[1] = Tensor::alloc_sized(input_desc_scores); U32 input_len_boxes = tensorNumElements(input_desc_boxes); U8 *input_boxes = ut_input_v(input_len_boxes, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensors[0], CPU_GENERAL), input_boxes, + UNI_MEMCPY(get_ptr_from_tensor(inputTensors[0], CPU_GENERAL), input_boxes, tensorNumBytes(input_desc_boxes)); U32 input_len_scores = tensorNumElements(input_desc_scores); U8 *input_scores = ut_input_v(input_len_scores, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensors[1], CPU_GENERAL), input_scores, + UNI_MEMCPY(get_ptr_from_tensor(inputTensors[1], CPU_GENERAL), input_scores, tensorNumBytes(input_desc_scores)); std::vector inputTensorsPtr(2); inputTensorsPtr[0] = &inputTensors[0]; @@ -60,9 +57,8 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensorsPtr, nonMaxSuppressionParamSpec, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); Tensor outputTensorRef = Tensor::alloc_sized(outputTensor.get_desc()); - U32 output_len = outputTensor.length(); - CHECK_REQUIREMENT(input_len_boxes == in0 * ic0 * ilens0 && - input_len_scores == in1 * ic1 * ilens1 && output_len == oh * ow); + CHECK_REQUIREMENT( + input_len_boxes == in0 * ic0 * ilens0 && input_len_scores == in1 * ic1 * ilens1); /* You can also change codes and use datas in the following example. 
Command: ./test_non_max_suppression 1 6 4 1 2 6 7 3 3 0.5 0 @@ -90,35 +86,16 @@ int nonmaxsuppressionTest(int argc, char **argv, DataType dt) inputTensors, nonMaxSuppressionParamSpec, outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), - get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), output_len, dt, 0.05, __FILE__, - __LINE__); + get_ptr_from_tensor(outputTensorRef, CPU_GENERAL), outputTensor.length(), dt, 0.05, + __FILE__, __LINE__); } - U32 num_detected_max = max_output_boxes_per_class * ic1; - if (dt == DT_F32) { - F32 *output_f32 = (F32 *)get_ptr_from_tensor(outputTensor, CPU_GENERAL); - int idx = 0; - for (U32 i = 0; i < 1 + num_detected_max; i++) { - for (int j = 0; j < 3; j++) { - printf("%d:%f ", j, output_f32[idx + j]); - } - printf("\n"); - idx = idx + 3; - } + TensorDesc outputDesc = outputTensor.get_desc(); + I32 *out = (I32 *)get_ptr_from_tensor(outputTensor, CPU_GENERAL); + U32 num_detected = outputDesc.dims[1]; + for (U32 i = 0; i < num_detected; i++) { + printf("(%d, %d, %d)\n", out[i * 3], out[i * 3 + 1], out[i * 3 + 2]); } -#ifdef _USE_FP16 - if (dt == DT_F16) { - F16 *output_f16 = (F16 *)get_ptr_from_tensor(outputTensorRef, CPU_GENERAL); - int idx = 0; - for (U32 i = 0; i < 1 + num_detected_max; i++) { - for (int j = 0; j < 3; j++) { - printf("%d:%f ", j + 1, output_f16[idx + j]); - } - printf("\n"); - idx = idx + 3; - } - } -#endif free(input_boxes); free(input_scores); return 0; diff --git a/compute/tensor/tests/test_normalization.cpp b/compute/tensor/tests/test_normalization.cpp index 63337cbe..22aa5dcb 100644 --- a/compute/tensor/tests/test_normalization.cpp +++ b/compute/tensor/tests/test_normalization.cpp @@ -22,6 +22,8 @@ int normalizationTest(int argc, char **argv, DataType dt) U32 ic = atoi(argv[3]); U32 ih = atoi(argv[4]); U32 iw = atoi(argv[5]); + LayerNormParamSpec p; + p.axis = -1; DataFormat df = DF_MTK; Tensor inputTensor; @@ -30,7 +32,7 @@ int normalizationTest(int argc, char **argv, DataType dt) inputTensor.alloc(); U32 input_len = tensorNumElements(inputDesc); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; @@ -56,16 +58,16 @@ int normalizationTest(int argc, char **argv, DataType dt) betaTensor.resize(betaDesc); alphaTensor.alloc(); betaTensor.alloc(); - memcpy(get_ptr_from_tensor(alphaTensor, CPU_GENERAL), alpha_list, tensorNumBytes(alphaDesc)); - memcpy(get_ptr_from_tensor(betaTensor, CPU_GENERAL), beta_list, tensorNumBytes(betaDesc)); + UNI_MEMCPY(get_ptr_from_tensor(alphaTensor, CPU_GENERAL), alpha_list, tensorNumBytes(alphaDesc)); + UNI_MEMCPY(get_ptr_from_tensor(betaTensor, CPU_GENERAL), beta_list, tensorNumBytes(betaDesc)); if (UT_CHECK) { CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); + inputTensor, p, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); // naive implement - CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensorRef, &UT_SERIAL_ARCHINFO)); + CHECK_STATUS(layer_normalization(inputTensor, p, alphaTensor, betaTensor, tmpTensor, + outputTensorRef, &UT_SERIAL_ARCHINFO)); // check ut_check_v(get_ptr_from_tensor(outputTensor, CPU_GENERAL), @@ -77,7 +79,7 @@ int normalizationTest(int argc, char **argv, 
DataType dt) double time_start = ut_time_ms(); for (int iter = 0; iter < UT_LOOPS; iter++) { CHECK_STATUS(layer_normalization( - inputTensor, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); + inputTensor, p, alphaTensor, betaTensor, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); } double time_end = ut_time_ms(); double time = (time_end - time_start) / UT_LOOPS; diff --git a/compute/tensor/tests/test_padding.cpp b/compute/tensor/tests/test_padding.cpp index c6ddf9ea..e310c2bb 100644 --- a/compute/tensor/tests/test_padding.cpp +++ b/compute/tensor/tests/test_padding.cpp @@ -52,20 +52,20 @@ int paddingTest(int argc, char **argv, DataType dt) padParamSpec.constant_value = 0.0; switch (mode) { case 0: { - padParamSpec.pad_mode = Pad_Constant; + padParamSpec.pad_mode = PAD_CONSTANT; break; } case 1: { - padParamSpec.pad_mode = Pad_Edge; + padParamSpec.pad_mode = PAD_EDGE; break; } case 2: { // limitation: the h_fir and the h_sec should lower than 0 - padParamSpec.pad_mode = Pad_Reflect; + padParamSpec.pad_mode = PAD_REFLECT; break; } case 3: { - padParamSpec.pad_mode = Pad_Symmetric; + padParamSpec.pad_mode = PAD_SYMMETRIC; break; } default: { @@ -80,7 +80,7 @@ int paddingTest(int argc, char **argv, DataType dt) inputTensor.alloc(); U32 input_len = tensorNumElements(inputDesc); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; diff --git a/compute/tensor/tests/test_padding_ocl.cpp b/compute/tensor/tests/test_padding_ocl.cpp index 677325bd..df8db6a2 100644 --- a/compute/tensor/tests/test_padding_ocl.cpp +++ b/compute/tensor/tests/test_padding_ocl.cpp @@ -44,20 +44,20 @@ int paddingTest(int argc, char **argv, DataType dt) padParamSpec.constant_value = 0.0; switch (mode) { case 0: { - padParamSpec.pad_mode = Pad_Constant; + padParamSpec.pad_mode = PAD_CONSTANT; break; } case 1: { - padParamSpec.pad_mode = Pad_Edge; + padParamSpec.pad_mode = PAD_EDGE; break; } case 2: { // limitation: the h_fir and the h_sec should lower than 0 - padParamSpec.pad_mode = Pad_Reflect; + padParamSpec.pad_mode = PAD_REFLECT; break; } case 3: { - padParamSpec.pad_mode = Pad_Symmetric; + padParamSpec.pad_mode = PAD_SYMMETRIC; break; } default: { @@ -135,7 +135,8 @@ int paddingTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDescCPU); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCPU, tensorNumBytes(inputDescCPU)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCPU, tensorNumBytes(inputDescCPU)); Tensor outputTensorCpu; CHECK_STATUS(padding_infer_output_size( diff --git a/compute/tensor/tests/test_pooling.cpp b/compute/tensor/tests/test_pooling.cpp index 67555d0e..07055813 100644 --- a/compute/tensor/tests/test_pooling.cpp +++ b/compute/tensor/tests/test_pooling.cpp @@ -26,19 +26,19 @@ int poolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MAX; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = 
atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc; if (it == 1) { @@ -49,7 +49,7 @@ int poolingTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U32 input_len = inputTensor.length(); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_pooling_bp.cpp b/compute/tensor/tests/test_pooling_bp.cpp index 34932ab8..c94a5cc4 100644 --- a/compute/tensor/tests/test_pooling_bp.cpp +++ b/compute/tensor/tests/test_pooling_bp.cpp @@ -26,19 +26,19 @@ int poolingbpTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc; if (it == 1) { @@ -49,7 +49,7 @@ int poolingbpTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); U32 input_len = inputTensor.length(); U8 *input = ut_input_v(input_len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_pooling_int8.cpp b/compute/tensor/tests/test_pooling_int8.cpp index 05634f24..767d7c84 100644 --- a/compute/tensor/tests/test_pooling_int8.cpp +++ b/compute/tensor/tests/test_pooling_int8.cpp @@ -27,19 +27,19 @@ int int8PoolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); TensorDesc inputDesc = tensor4df(DT_I8, DF_NCHWC8, in, ic, ih, iw); TensorDesc inputDescRef = inputDesc; diff --git a/compute/tensor/tests/test_pooling_ocl.cpp b/compute/tensor/tests/test_pooling_ocl.cpp index a9ceee44..71c8cdb6 100644 --- a/compute/tensor/tests/test_pooling_ocl.cpp +++ b/compute/tensor/tests/test_pooling_ocl.cpp @@ -47,19 +47,19 @@ int poolingTest(int argc, char **argv, DataType dt) PoolingParamSpec p; p.mode = POOLING_MEAN; - p.rm = 
CEIL; + p.round_mode = ROUND_CEIL; p.kernel_t = atoi(argv[6]); p.kernel_h = atoi(argv[7]); p.kernel_w = atoi(argv[8]); p.stride_t = atoi(argv[9]); p.stride_h = atoi(argv[10]); p.stride_w = atoi(argv[11]); - p.padding_before = atoi(argv[12]); - p.padding_after = atoi(argv[13]); - p.padding_top = atoi(argv[14]); - p.padding_bottom = atoi(argv[15]); - p.padding_left = atoi(argv[16]); - p.padding_right = atoi(argv[17]); + p.pad_before = atoi(argv[12]); + p.pad_after = atoi(argv[13]); + p.pad_top = atoi(argv[14]); + p.pad_bottom = atoi(argv[15]); + p.pad_left = atoi(argv[16]); + p.pad_right = atoi(argv[17]); ArchInfo archInfo; archInfo.arch = MALI; @@ -84,7 +84,7 @@ int poolingTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDescCpu); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu_nchwc8, + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu_nchwc8, tensorNumBytes(inputDescCpu)); Tensor outputTensorCpu; diff --git a/compute/tensor/tests/test_power.cpp b/compute/tensor/tests/test_power.cpp index a61371fc..e4146da4 100644 --- a/compute/tensor/tests/test_power.cpp +++ b/compute/tensor/tests/test_power.cpp @@ -28,10 +28,10 @@ int powerTest(int argc, char **argv, DataType dt) inputTensor.resize(inputDesc); inputTensor.alloc(); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); // set output Tensor outputTensor, outputTensorRef; - CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &UT_CPU_ARCHINFO)); + CHECK_STATUS(power_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); outputTensor.alloc(); TensorDesc outputDesc_ref = outputTensor.get_desc(); outputTensorRef.resize(outputDesc_ref); diff --git a/compute/tensor/tests/test_power_ocl.cpp b/compute/tensor/tests/test_power_ocl.cpp index bcc3dc70..009f041b 100644 --- a/compute/tensor/tests/test_power_ocl.cpp +++ b/compute/tensor/tests/test_power_ocl.cpp @@ -61,7 +61,7 @@ int powerTest(int argc, char **argv, DataType dt) MaliPara maliPara; maliPara.handle = handle; archInfo.archPara = &maliPara; - CHECK_STATUS(power_infer_output_size(&inputTensor, &outputTensor, &archInfo)); + CHECK_STATUS(power_infer_output_size(&inputTensor, p, &outputTensor, &archInfo)); TensorDesc output_desc_gpu = outputTensor.get_desc(); U8 *output_gpu = ut_input_v(on * oc * oh * ow, dt, UT_INIT_RANDOM); @@ -106,7 +106,7 @@ int powerTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(input_desc_cpu); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(input_desc_cpu)); Tensor outputTensorCpu; diff --git a/compute/tensor/tests/test_prelu.cpp b/compute/tensor/tests/test_prelu.cpp index e5536df0..cbd2c109 100644 --- a/compute/tensor/tests/test_prelu.cpp +++ b/compute/tensor/tests/test_prelu.cpp @@ -35,8 +35,8 @@ int preluTest(int argc, char **argv, DataType dt) Tensor inputTensor = Tensor::alloc_sized(inputDesc); Tensor weightTensor = Tensor::alloc_sized(weightDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); - memcpy(get_ptr_from_tensor(weightTensor, CPU_GENERAL), weight, tensorNumBytes(weightDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, 
CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(weightTensor, CPU_GENERAL), weight, tensorNumBytes(weightDesc)); // set output Tensor outputTensor; diff --git a/compute/tensor/tests/test_prelu_ocl.cpp b/compute/tensor/tests/test_prelu_ocl.cpp index 473803b5..a6c2a991 100644 --- a/compute/tensor/tests/test_prelu_ocl.cpp +++ b/compute/tensor/tests/test_prelu_ocl.cpp @@ -70,7 +70,7 @@ int preluTest(int argc, char **argv, DataType dt) U32 icAlign = (ic + 3) / 4 * 4; if (!preluDesc.propagate_down) { U8 *weightAlign = ut_input_v(icAlign, dt, UT_INIT_ZERO); - memcpy(weightAlign, weightCPU, ic * bytesOf(dt)); + UNI_MEMCPY(weightAlign, weightCPU, ic * bytesOf(dt)); free(weightCPU); weightCPU = weightAlign; alloc_padding(weightTensor, 0, icAlign - ic, 0, 0, weightCPU); diff --git a/compute/tensor/tests/test_reduction.cpp b/compute/tensor/tests/test_reduction.cpp index 327af183..1fda5cf1 100644 --- a/compute/tensor/tests/test_reduction.cpp +++ b/compute/tensor/tests/test_reduction.cpp @@ -22,11 +22,11 @@ int reductionTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); ReductionParamSpec p; - p.axes_num = atoi(argv[5]); - for (int i = 0; i < p.axes_num; i++) { + p.num_axes = atoi(argv[5]); + for (int i = 0; i < p.num_axes; i++) { p.axes[i] = atoi(argv[6 + i]); } - p.reduction_mode = REDUCTION_MEAN; + p.mode = REDUCTION_MEAN; p.coeff = 1.0; p.keep_dim = true; DataFormat df = DF_NCHW; @@ -38,7 +38,7 @@ int reductionTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor maskTensor; maskTensor.resize(maskDesc); @@ -85,7 +85,7 @@ int reductionTest(int argc, char **argv, DataType dt) CHECK_STATUS(tensor4dGet(outputTensor.get_desc(), &dt, &df, &on, &oc, &oh, &ow)); char buffer[150]; char params[120]; - sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow); + sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.num_axes, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Reduction", params); double ops = 1.0 * in * ic * ih * iw; ut_log(dt, buffer, ops, time / UT_LOOPS); diff --git a/compute/tensor/tests/test_reduction_ocl.cpp b/compute/tensor/tests/test_reduction_ocl.cpp index b7a67d33..9d2bb0a9 100644 --- a/compute/tensor/tests/test_reduction_ocl.cpp +++ b/compute/tensor/tests/test_reduction_ocl.cpp @@ -24,11 +24,11 @@ int reductionTest(int argc, char **argv, DataType dt) ReductionParamSpec p; p.keep_dim = atoi(argv[5]); bool use_c4 = atoi(argv[6]); - p.axes_num = atoi(argv[7]); - for (int i = 0; i < p.axes_num; i++) { + p.num_axes = atoi(argv[7]); + for (int i = 0; i < p.num_axes; i++) { p.axes[i] = atoi(argv[8 + i]); } - p.reduction_mode = REDUCTION_MEAN; + p.mode = REDUCTION_MEAN; p.coeff = 1.0; TensorDesc maskDesc; maskDesc.nDims = 0; @@ -45,7 +45,8 @@ int reductionTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor maskTensorCpu; maskTensorCpu.resize(maskDesc); @@ -117,7 +118,7 @@ int reductionTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; 
- sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.axes_num, on, oc, oh, ow); + sprintf(params, "(%u %u %u %u) %d =(%u %u %u %u)", in, ic, ih, iw, p.num_axes, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "Reduction", params); #ifdef _DEBUG double ops = len; diff --git a/compute/tensor/tests/test_reshape.cpp b/compute/tensor/tests/test_reshape.cpp index 2bf39380..0d73239b 100644 --- a/compute/tensor/tests/test_reshape.cpp +++ b/compute/tensor/tests/test_reshape.cpp @@ -24,10 +24,10 @@ int reshapeTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); ReshapeParamSpec p; - p.shape_size = atoi(argv[5]); - CHECK_REQUIREMENT(argc == 6 + p.shape_size); - for (I32 i = 0; i < p.shape_size; i++) { - p.shape_dims[i] = atoi(argv[6 + i]); + p.num_shape = atoi(argv[5]); + CHECK_REQUIREMENT(argc == 6 + p.num_shape); + for (I32 i = 0; i < p.num_shape; i++) { + p.shape[i] = atoi(argv[6 + i]); } DataFormat df = DF_NCHW; @@ -37,7 +37,7 @@ int reshapeTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(reshape_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); @@ -61,16 +61,16 @@ int reshapeTest(int argc, char **argv, DataType dt) // log performance data char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "(%u %u %u %u)=(", in, ic, ih, iw); - for (I32 i = 0; i < p.shape_size; i++) { + for (I32 i = 0; i < p.num_shape; i++) { I32 index = 0; for (; index < 120; index++) { if (params[index] == '\0') { break; } } - if (i != p.shape_size - 1) { + if (i != p.num_shape - 1) { sprintf(params + index, "%d ", outDesc.dims[outDesc.nDims - 1 - i]); } else { sprintf(params + index, "%d)", outDesc.dims[outDesc.nDims - 1 - i]); diff --git a/compute/tensor/tests/test_reshape_ocl.cpp b/compute/tensor/tests/test_reshape_ocl.cpp index f17b15f3..b7d5fde6 100644 --- a/compute/tensor/tests/test_reshape_ocl.cpp +++ b/compute/tensor/tests/test_reshape_ocl.cpp @@ -24,9 +24,9 @@ int reshapeTest(int argc, char **argv, DataType dt) for (U32 i = 0; i < inputDesc.nDims; i++) { inputDesc.dims[inputDesc.nDims - i - 1] = atoi(argv[i + 2]); } - p.shape_size = atoi(argv[inputDesc.nDims + 2]); - for (I32 i = 0; i < p.shape_size; i++) { - p.shape_dims[i] = atoi(argv[i + inputDesc.nDims + 3]); + p.num_shape = atoi(argv[inputDesc.nDims + 2]); + for (I32 i = 0; i < p.num_shape; i++) { + p.shape[i] = atoi(argv[i + inputDesc.nDims + 3]); } ArchInfo archInfo; @@ -38,7 +38,8 @@ int reshapeTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; Tensor tmpTensorCpu; @@ -98,7 +99,7 @@ int reshapeTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "("); for (U32 i = 0; i < inputDesc.nDims; i++) { if (i != inputDesc.nDims - 1) { @@ -107,14 +108,14 @@ int reshapeTest(int argc, char **argv, DataType dt) sprintf(params + i * 2 + 1, "%d) = (", inputDesc.dims[inputDesc.nDims - 1 - i]); 
} } - for (I32 i = 0; i < p.shape_size; i++) { + for (I32 i = 0; i < p.num_shape; i++) { I32 index = 0; for (; index < 120; index++) { if (params[index] == '\0') { break; } } - if (i != p.shape_size - 1) { + if (i != p.num_shape - 1) { sprintf(params + index, "%d ", outputDesc.dims[outputDesc.nDims - 1 - i]); } else { sprintf(params + index, "%d)", outputDesc.dims[outputDesc.nDims - 1 - i]); diff --git a/compute/tensor/tests/test_rnn.cpp b/compute/tensor/tests/test_rnn.cpp index 556fa58a..f78b58b9 100644 --- a/compute/tensor/tests/test_rnn.cpp +++ b/compute/tensor/tests/test_rnn.cpp @@ -25,20 +25,20 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) RNNParamSpec rnnParamSpec; rnnParamSpec.mode = mode; rnnParamSpec.steps = step; - rnnParamSpec.biDirection = false; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = 0; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.activationMode = ACTIVATION_TANH; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.bi_direction = false; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = 0; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.activation_type = ACTIVATION_TANH; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; U32 weightNum = 1; U32 biasNum = 1; int factor = 0; switch (mode) { case RNN_LSTM: - rnnParamSpec.numProjection = 1024; + rnnParamSpec.num_projection = 1024; factor = 4; break; case RNN_GRU: @@ -52,39 +52,39 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) return 1; } F32 threshold = 10; - if (rnnParamSpec.numProjection > 0) { + if (rnnParamSpec.num_projection > 0) { weightNum++; biasNum++; threshold = 40; } if (rnnParamSpec.mode != RNN_LSTM) { - rnnParamSpec.numProjection = 0; - rnnParamSpec.forgetBias = 0; + rnnParamSpec.num_projection = 0; + rnnParamSpec.forget_bias = 0; } - U32 column = (rnnParamSpec.numProjection > 0) ? rnnParamSpec.numProjection - : rnnParamSpec.numOutput; + U32 column = (rnnParamSpec.num_projection > 0) ? 
rnnParamSpec.num_projection + : rnnParamSpec.num_outputs; TensorDesc inputDesc = tensor3df(dt, DF_MTK, batch, step, xDim); Tensor inputTensor; inputTensor.resize(inputDesc); inputTensor.alloc(); U32 inputLength = batch * step * xDim; U8 *input = ut_input_v(inputLength, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inputDesc)); U32 tmpBytes; std::vector filterDesc(2), biasDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, factor * column, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.numOutput, rnnParamSpec.numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, rnnParamSpec.num_outputs, rnnParamSpec.num_projection); biasDesc[0] = tensor1d(dt, column * factor); - biasDesc[1] = tensor1d(dt, rnnParamSpec.numOutput); + biasDesc[1] = tensor1d(dt, rnnParamSpec.num_outputs); std::vector filterTensor(weightNum), biasTensor(biasNum); for (U32 i = 0; i < weightNum; i++) { filterTensor[i].resize(filterDesc[i]); filterTensor[i].alloc(); U8 *filter = ut_input_v(tensorNumBytes(filterDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensor[i], CPU_GENERAL), filter, + UNI_MEMCPY(get_ptr_from_tensor(filterTensor[i], CPU_GENERAL), filter, tensorNumBytes(filterDesc[i])); free(filter); } @@ -93,7 +93,8 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) biasTensor[i].resize(biasDesc[i]); biasTensor[i].alloc(); U8 *bias = ut_input_v(tensorNumBytes(biasDesc[i]) / bytesOf(dt), dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensor[i], CPU_GENERAL), bias, tensorNumBytes(biasDesc[i])); + UNI_MEMCPY( + get_ptr_from_tensor(biasTensor[i], CPU_GENERAL), bias, tensorNumBytes(biasDesc[i])); free(bias); } @@ -140,12 +141,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) std::vector outputTensorRefVec(1, outputTensorRef); std::vector tmpTensorVec(1, tmpTensor); if (UT_CHECK) { - memset(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); CHECK_STATUS(rnn(inputTensorVec, ftmTensor, biasTensor, rnnParamSpec, tmpTensorVec, outputTensorVec, &UT_CPU_ARCHINFO)); // naive implement - memset(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensor, UT_CPU_ARCHINFO.arch), 0, tmpBytes); CHECK_STATUS(rnn(inputTensorVec, ftmTensorRef, biasTensor, rnnParamSpec, tmpTensorVec, outputTensorRefVec, &UT_SERIAL_ARCHINFO)); @@ -172,7 +173,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) double hxDim = hDim + xDim; double ops = 1.0 * batch * step * (2.0 * hxDim * column * factor + column * factor + - rnnParamSpec.numProjection * rnnParamSpec.numOutput); + rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); free(input); diff --git a/compute/tensor/tests/test_rnn_ocl.cpp b/compute/tensor/tests/test_rnn_ocl.cpp index 97df2e6f..51e60be1 100644 --- a/compute/tensor/tests/test_rnn_ocl.cpp +++ b/compute/tensor/tests/test_rnn_ocl.cpp @@ -16,12 +16,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) { - U32 batch, step, xDim, hDim, numProjection, biDir; + U32 batch, step, xDim, hDim, num_projection, biDir; batch = atoi(argv[1]); step = atoi(argv[2]); xDim = atoi(argv[3]); hDim = atoi(argv[4]); - numProjection = atoi(argv[5]); + num_projection = atoi(argv[5]); biDir = atoi(argv[6]); ArchInfo archInfo; 
archInfo.arch = MALI; @@ -31,22 +31,22 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) RNNParamSpec rnnParamSpec; rnnParamSpec.mode = RNN_LSTM; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = numProjection; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = num_projection; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; rnnParamSpec.steps = 0; - rnnParamSpec.biDirection = (biDir) ? true : false; - rnnParamSpec.activationMode = ACTIVATION_TANH; + rnnParamSpec.bi_direction = (biDir) ? true : false; + rnnParamSpec.activation_type = ACTIVATION_TANH; - U32 col = (numProjection > 0) ? numProjection : hDim; + U32 col = (num_projection > 0) ? num_projection : hDim; TensorDesc inputDesc = tensor3df(dt, DF_NORMAL, batch, step, xDim); std::vector biasDesc(2); std::vector filterDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, 4 * col, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, hDim, numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, hDim, num_projection); biasDesc[0] = tensor1d(dt, 4 * col); biasDesc[1] = tensor1d(dt, hDim); @@ -54,7 +54,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - U32 filterNum = (numProjection) ? 2 : 1; + U32 filterNum = (num_projection) ? 2 : 1; U32 biDirNum = (biDir) ? 2 : 1; std::vector filterTensorCpu(filterNum * biDirNum); std::vector biasTensorCpu(filterNum * biDirNum); @@ -70,7 +70,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) Tensor outputTensorCpu; U32 inputLen = tensorNumElements(inputDesc); U8 *input_cpu = ut_input_v(inputLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); std::vector bias_cpu(filterNum * biDirNum); std::vector filter_cpu(filterNum * biDirNum); @@ -78,12 +78,12 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) for (U32 j = 0; j < filterNum; j++) { U32 len = tensorNumElements(biasDesc[j]); bias_cpu[i * filterNum + j] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensorCpu[i * filterNum + j], CPU_GENERAL), + UNI_MEMCPY(get_ptr_from_tensor(biasTensorCpu[i * filterNum + j], CPU_GENERAL), bias_cpu[i * filterNum + j], len * bytesOf(dt)); len = tensorNumElements(filterDesc[j]); filter_cpu[i * filterNum + j] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(filterTensorCpu[i * filterNum + j], CPU_GENERAL), + UNI_MEMCPY(get_ptr_from_tensor(filterTensorCpu[i * filterNum + j], CPU_GENERAL), filter_cpu[i * filterNum + j], len * bytesOf(dt)); } } @@ -103,7 +103,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) TensorDesc tmpDesc = tensor1d(DT_U8, tmpBytes); tmpTensorCpu.resize(tmpDesc); tmpTensorCpu.alloc(); - memset(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); std::vector ftmBytes(4); CHECK_STATUS(rnn_transform_filter_bytes( @@ -259,7 +259,7 @@ int rnnTest(int argc, char **argv, DataType dt, RNNMode mode) #ifdef _DEBUG double hxDim = hDim + xDim; double ops = 1.0 * batch * step * - (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + (2.0 * hxDim * col * 4 + col * 4 + 
rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); #endif ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), diff --git a/compute/tensor/tests/test_rnncell_ocl.cpp b/compute/tensor/tests/test_rnncell_ocl.cpp index ea952c69..16e1d17c 100644 --- a/compute/tensor/tests/test_rnncell_ocl.cpp +++ b/compute/tensor/tests/test_rnncell_ocl.cpp @@ -16,36 +16,36 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) { - U32 xDim, hDim, numProjection; + U32 xDim, hDim, num_projection; xDim = atoi(argv[1]); hDim = atoi(argv[2]); if (argc == 4) { - numProjection = atoi(argv[3]); + num_projection = atoi(argv[3]); } else { - numProjection = 0; + num_projection = 0; } ArchInfo archInfo; archInfo.arch = MALI; RNNParamSpec rnnParamSpec; rnnParamSpec.mode = RNN_LSTM; - rnnParamSpec.numOutput = hDim; - rnnParamSpec.numProjection = numProjection; - rnnParamSpec.forgetBias = 1.0; - rnnParamSpec.zoneoutCell = 0; - rnnParamSpec.zoneoutOutput = 0; + rnnParamSpec.num_outputs = hDim; + rnnParamSpec.num_projection = num_projection; + rnnParamSpec.forget_bias = 1.0; + rnnParamSpec.zoneout_cell = 0; + rnnParamSpec.zoneout_output = 0; rnnParamSpec.steps = -1; - rnnParamSpec.biDirection = false; - rnnParamSpec.activationMode = ACTIVATION_TANH; + rnnParamSpec.bi_direction = false; + rnnParamSpec.activation_type = ACTIVATION_TANH; - U32 col = (numProjection > 0) ? numProjection : hDim; + U32 col = (num_projection > 0) ? num_projection : hDim; TensorDesc inputDesc = tensor2df(dt, DF_NORMAL, 1, xDim); TensorDesc stateDesc = tensor2df(dt, DF_NORMAL, 1, col + hDim); std::vector biasDesc(2); std::vector filterDesc(2); filterDesc[0] = tensor2df(dt, DF_NK, 4 * col, xDim + hDim); - filterDesc[1] = tensor2df(dt, DF_NK, hDim, numProjection); + filterDesc[1] = tensor2df(dt, DF_NK, hDim, num_projection); biasDesc[0] = tensor1d(dt, 4 * col); biasDesc[1] = tensor1d(dt, hDim); @@ -69,10 +69,10 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) U32 inputLen = tensorNumElements(inputDesc); U32 stateLen = tensorNumElements(stateDesc); U8 *input_cpu = ut_input_v(inputLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, inputLen * bytesOf(dt)); U8 *state_cpu = ut_input_v(stateLen, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(stateTensorCpu, CPU_GENERAL), state_cpu, stateLen * bytesOf(dt)); + UNI_MEMCPY(get_ptr_from_tensor(stateTensorCpu, CPU_GENERAL), state_cpu, stateLen * bytesOf(dt)); U8 *state_gpu_host = ut_input_v(stateLen, dt, UT_INIT_ZERO); std::vector bias_cpu(2); @@ -80,11 +80,12 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) for (U32 i = 0; i < 2; i++) { U32 len = tensorNumElements(biasDesc[i]); bias_cpu[i] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy(get_ptr_from_tensor(biasTensorCpu[i], CPU_GENERAL), bias_cpu[i], len * bytesOf(dt)); + UNI_MEMCPY( + get_ptr_from_tensor(biasTensorCpu[i], CPU_GENERAL), bias_cpu[i], len * bytesOf(dt)); len = tensorNumElements(filterDesc[i]); filter_cpu[i] = ut_input_v(len, dt, UT_INIT_RANDOM); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(filterTensorCpu[i], CPU_GENERAL), filter_cpu[i], len * bytesOf(dt)); } @@ -102,7 +103,7 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) TensorDesc tmpDesc = tensor1d(DT_U8, tmpBytes); tmpTensorCpu.resize(tmpDesc); tmpTensorCpu.alloc(); - memset(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 
0, tmpBytes); + UNI_MEMSET(get_ptr_from_tensor(tmpTensorCpu, CPU_GENERAL), 0, tmpBytes); std::vector ftmBytes(2); CHECK_STATUS(rnn_transform_filter_bytes( @@ -235,7 +236,7 @@ int rnncellTest(int argc, char **argv, DataType dt, RNNMode mode) #ifdef _DEBUG double hxDim = hDim + xDim; double ops = 1.0 * - (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.numProjection * rnnParamSpec.numOutput); + (2.0 * hxDim * col * 4 + col * 4 + rnnParamSpec.num_projection * rnnParamSpec.num_outputs); ut_log(dt, buffer, ops, time); #endif ut_check_a(output_gpu, get_ptr_from_tensor(outputTensorCpu, CPU_GENERAL), diff --git a/compute/tensor/tests/test_roialign.cpp b/compute/tensor/tests/test_roialign.cpp index 6ff8ccde..c1e7633d 100644 --- a/compute/tensor/tests/test_roialign.cpp +++ b/compute/tensor/tests/test_roialign.cpp @@ -39,6 +39,7 @@ int roialignTest(int argc, char **argv, DataType dt) F32 spatial_scale = (F32)atof(argv[15]); RoIAlignParamSpec p; + p.mode = POOLING_MEAN; p.output_h = output_h; p.output_w = output_w; p.sampling_ratio = sampling_ratio; @@ -58,11 +59,11 @@ int roialignTest(int argc, char **argv, DataType dt) U8 *input_feat = ut_input_v(input_len_feat, dt, UT_INIT_RANDOM); U8 *input_rois = ut_input_v(input_len_rois, dt, UT_INIT_RANDOM); U8 *input_batch = ut_input_v(input_len_batch, dt, UT_INIT_ZERO); - memcpy(get_ptr_from_tensor(inputTensor_feat, CPU_GENERAL), input_feat, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_feat, CPU_GENERAL), input_feat, tensorNumBytes(inputDesc_feat)); - memcpy(get_ptr_from_tensor(inputTensor_rois, CPU_GENERAL), input_rois, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_rois, CPU_GENERAL), input_rois, tensorNumBytes(inputDesc_rois)); - memcpy(get_ptr_from_tensor(inputTensor_batch, CPU_GENERAL), input_batch, + UNI_MEMCPY(get_ptr_from_tensor(inputTensor_batch, CPU_GENERAL), input_batch, tensorNumBytes(inputDesc_batch)); inputTensors[0] = inputTensor_feat; inputTensors[1] = inputTensor_rois; diff --git a/compute/tensor/tests/test_roialign_ocl.cpp b/compute/tensor/tests/test_roialign_ocl.cpp index 8d3882a4..297b315b 100644 --- a/compute/tensor/tests/test_roialign_ocl.cpp +++ b/compute/tensor/tests/test_roialign_ocl.cpp @@ -58,7 +58,7 @@ int roialignTest(int argc, char *argv[], DataType dt) } RoIAlignParamSpec p; - p.coordinateTransformationMode = ROIALIGN_HALF_PIXEL; + p.trans_mode = COORDINATE_TRANS_HALF_PIXEL; p.mode = POOLING_MEAN; p.output_w = ow; p.output_h = oh; @@ -88,7 +88,7 @@ int roialignTest(int argc, char *argv[], DataType dt) inTensorsCpu[i].resize(roiDesc); } inTensorsCpu[i].alloc(); - memcpy(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], + UNI_MEMCPY(get_ptr_from_tensor(inTensorsCpu[i], CPU_GENERAL), inputCpu[i], tensorNumBytes(inTensorsCpu[i].get_desc())); inTensorPtrCpu[i] = &inTensorsCpu[i]; } diff --git a/compute/tensor/tests/test_scale.cpp b/compute/tensor/tests/test_scale.cpp index 9bf5a068..82a92609 100644 --- a/compute/tensor/tests/test_scale.cpp +++ b/compute/tensor/tests/test_scale.cpp @@ -35,8 +35,8 @@ int scaleTest(int argc, char **argv, DataType dt) dataTensorRef.resize(inDesc); dataTensor.alloc(); dataTensorRef.alloc(); - memcpy(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(inDesc)); - memcpy(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensor, CPU_GENERAL), data, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(dataTensorRef, CPU_GENERAL), data, tensorNumBytes(inDesc)); U8 *alpha = ut_input_v(ic, dt, UT_INIT_RANDOM); U8 
*beta = ut_input_v(ic, dt, UT_INIT_RANDOM); diff --git a/compute/tensor/tests/test_scale_ocl.cpp b/compute/tensor/tests/test_scale_ocl.cpp index e9c4fa07..7f9e56ae 100644 --- a/compute/tensor/tests/test_scale_ocl.cpp +++ b/compute/tensor/tests/test_scale_ocl.cpp @@ -65,7 +65,8 @@ int scaleTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, outputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inputDesc)); CHECK_STATUS( scale_infer_output_size(&inputTensorCpu, p, axisLen, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); outputTensorCpu.alloc(); diff --git a/compute/tensor/tests/test_slice.cpp b/compute/tensor/tests/test_slice.cpp index a0f1142b..3d3b9e11 100644 --- a/compute/tensor/tests/test_slice.cpp +++ b/compute/tensor/tests/test_slice.cpp @@ -27,8 +27,8 @@ int sliceTest(int argc, char **argv, DataType dt) U32 iw = atoi(argv[5]); SliceParamSpec p; p.axis = atoi(argv[6]); - p.slice_size = num - 1; - for (U32 i = 0; i < p.slice_size; i++) { + p.num_slice = num - 1; + for (U32 i = 0; i < p.num_slice; i++) { p.slice_points[i] = atoi(argv[7 + i]); } @@ -39,7 +39,7 @@ int sliceTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); std::vector outputTensors(num); std::vector outputTensorsPtr(num); diff --git a/compute/tensor/tests/test_slice_ocl.cpp b/compute/tensor/tests/test_slice_ocl.cpp index b0ec7672..9870f343 100644 --- a/compute/tensor/tests/test_slice_ocl.cpp +++ b/compute/tensor/tests/test_slice_ocl.cpp @@ -27,8 +27,8 @@ int sliceTest(int argc, char **argv, DataType dt) U32 iw = atoi(argv[5]); SliceParamSpec p; p.axis = atoi(argv[6]); - p.slice_size = num - 1; - for (U32 i = 0; i < p.slice_size; i++) { + p.num_slice = num - 1; + for (U32 i = 0; i < p.num_slice; i++) { p.slice_points[i] = atoi(argv[7 + i]); } ArchInfo archInfo; @@ -45,7 +45,7 @@ int sliceTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), inputCpu, tensorNumBytes(inDesc)); std::vector outputTensorsCpu(num); std::vector outputTensorsPtrCpu(num); for (I32 i = 0; i < num; i++) { diff --git a/compute/tensor/tests/test_softmax.cpp b/compute/tensor/tests/test_softmax.cpp index 4b3a9361..210932eb 100644 --- a/compute/tensor/tests/test_softmax.cpp +++ b/compute/tensor/tests/test_softmax.cpp @@ -24,7 +24,7 @@ int softmaxTest(int argc, char **argv, DataType dt) TensorDesc inDesc = tensor2df(dt, DF_NORMAL, 1, len); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); Tensor inputTensor = Tensor::alloc_sized(inDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor; CHECK_STATUS(softmax_infer_output_size(&inputTensor, p, &outputTensor, &UT_CPU_ARCHINFO)); diff --git a/compute/tensor/tests/test_softmax_ocl.cpp b/compute/tensor/tests/test_softmax_ocl.cpp index f640f1bd..b1c01d90 100644 --- 
a/compute/tensor/tests/test_softmax_ocl.cpp +++ b/compute/tensor/tests/test_softmax_ocl.cpp @@ -108,7 +108,7 @@ int softmaxTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(in_desc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(in_desc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(in_desc)); Tensor outputTensorCpu; outputTensorCpu.resize(out_desc); diff --git a/compute/tensor/tests/test_split.cpp b/compute/tensor/tests/test_split.cpp index 03699ca6..8102da9b 100644 --- a/compute/tensor/tests/test_split.cpp +++ b/compute/tensor/tests/test_split.cpp @@ -32,7 +32,7 @@ int splitTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); std::vector outputTensors(num); std::vector outputTensorsPtr(num); diff --git a/compute/tensor/tests/test_tfslice_ocl.cpp b/compute/tensor/tests/test_tfslice_ocl.cpp index 5ac0eec4..0afd6386 100644 --- a/compute/tensor/tests/test_tfslice_ocl.cpp +++ b/compute/tensor/tests/test_tfslice_ocl.cpp @@ -49,7 +49,8 @@ int tfsliceTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); Tensor outputTensorCpu; Tensor tmpTensorCpu; @@ -110,7 +111,7 @@ int tfsliceTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "(%u %u %u %u)=(%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "%20s, %80s", "tfslice", params); #ifdef _DEBUG diff --git a/compute/tensor/tests/test_tile.cpp b/compute/tensor/tests/test_tile.cpp index 76ad94d9..45a9bade 100644 --- a/compute/tensor/tests/test_tile.cpp +++ b/compute/tensor/tests/test_tile.cpp @@ -23,8 +23,8 @@ int tileTest(int argc, char **argv, DataType dt) //input axis and tiles TileParamSpec tileParamSpec; tileParamSpec.axis = atoi(argv[5]); - tileParamSpec.dimsSize = 1; - tileParamSpec.repeatsInfo[0] = atoi(argv[6]); + tileParamSpec.num_repeats = 1; + tileParamSpec.repeats[0] = atoi(argv[6]); //set input DataFormat df = DF_NCHW; @@ -32,7 +32,7 @@ int tileTest(int argc, char **argv, DataType dt) U32 len = tensorNumElements(inDesc); U8 *input = ut_input_v(len, dt, UT_INIT_RANDOM); Tensor inputTensor = Tensor::alloc_sized(inDesc); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, inputTensor.bytes()); //set output Tensor outputTensor; @@ -43,7 +43,7 @@ int tileTest(int argc, char **argv, DataType dt) Tensor tmpTensor; CHECK_STATUS(tile(inputTensor, tileParamSpec, tmpTensor, outputTensor, &UT_CPU_ARCHINFO)); - CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeatsInfo[0])); + CHECK_REQUIREMENT(outputTensor.length() == (len * tileParamSpec.repeats[0])); } return 0; diff --git a/compute/tensor/tests/test_tile_ocl.cpp b/compute/tensor/tests/test_tile_ocl.cpp index 4ebbb5b2..d19c0d42 100644 --- a/compute/tensor/tests/test_tile_ocl.cpp +++ b/compute/tensor/tests/test_tile_ocl.cpp @@ -22,12 
+22,12 @@ int tileTest(int argc, char **argv, DataType dt) } U32 iDim[8]; TileParamSpec tileParamSpec; - tileParamSpec.dimsSize = nDims; + tileParamSpec.num_repeats = nDims; for (U32 i = 2; i < nDims + 2; i++) { iDim[i - 2] = atoi(argv[i]); } for (U32 i = nDims + 2; i < 2 * nDims + 2; i++) { - tileParamSpec.repeatsInfo[i - nDims - 2] = atoi(argv[i]); + tileParamSpec.repeats[i - nDims - 2] = atoi(argv[i]); } ArchInfo archInfo; @@ -58,7 +58,8 @@ int tileTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu, outputTensorCpu, tmpTensorCpu; inputTensorCpu.resize(inputDesc); inputTensorCpu.alloc(); - memcpy(get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); + UNI_MEMCPY( + get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc)); CHECK_STATUS(tile_infer_output_size( &inputTensorCpu, tileParamSpec, &outputTensorCpu, &UT_SERIAL_ARCHINFO)); @@ -122,7 +123,7 @@ int tileTest(int argc, char **argv, DataType dt) char buffer[150]; char params[120]; - memset(params, 0, 120); + UNI_MEMSET(params, 0, 120); sprintf(params, "("); for (U32 i = 0; i < inputDesc.nDims; i++) { if (i != inputDesc.nDims - 1) { diff --git a/compute/tensor/tests/test_topk_ocl.cpp b/compute/tensor/tests/test_topk_ocl.cpp index 66352a03..251f7567 100644 --- a/compute/tensor/tests/test_topk_ocl.cpp +++ b/compute/tensor/tests/test_topk_ocl.cpp @@ -18,14 +18,14 @@ inline void topk_cpu_max(F16 *input, U32 len, U32 topk, F16 *output, I32 *output { for (U32 i = 0; i < topk; i++) { U32 index = 0; - F16 max_val = -65536; + F16 max_val = -UNI_F16_MAX; for (U32 j = 0; j < len; j++) { if (input[j] > max_val) { max_val = input[j]; index = j; } } - input[index] = -65536; + input[index] = -UNI_F16_MAX; output[i] = max_val; outputId[i] = index; } @@ -36,8 +36,8 @@ inline void sort_gpu_result( { std::vector skip_j; for (U32 i = 0; i < topk; i++) { - F16 max_val = -65536; - I32 index = 65536; + F16 max_val = -UNI_F16_MAX; + I32 index = UNI_F16_MAX; U32 sj = 0; for (U32 j = 0; j < topk; j++) { bool skip = false; @@ -73,7 +73,7 @@ int topkTest(int argc, char **argv, DataType dt) U32 iw = 3000; TopKParamSpec p; p.axis = 0; - p.topk = 30; + p.k = 30; p.largest = 1; p.sorted = 0; if (argc == 8) { @@ -82,7 +82,7 @@ int topkTest(int argc, char **argv, DataType dt) ih = atoi(argv[3]); iw = atoi(argv[4]); p.axis = atof(argv[5]); - p.topk = atof(argv[6]); + p.k = atof(argv[6]); p.largest = atof(argv[7]); p.sorted = atof(argv[8]); } @@ -94,8 +94,8 @@ int topkTest(int argc, char **argv, DataType dt) U32 len = in * ic * ih * iw; TensorDesc input_desc_cpu = tensor1d(dt, len); - TensorDesc output_desc_cpu = tensor1d(dt, (U32)p.topk); - TensorDesc output_indices_desc_cpu = tensor1d(DT_I32, (U32)p.topk); + TensorDesc output_desc_cpu = tensor1d(dt, (U32)p.k); + TensorDesc output_indices_desc_cpu = tensor1d(DT_I32, (U32)p.k); TensorDesc input_desc_gpu = tensor1d(dt, len); TensorDesc output_desc_gpu, output_indices_desc_gpu; @@ -166,16 +166,16 @@ int topkTest(int argc, char **argv, DataType dt) sprintf(params, "(%u %u %u %u) = (%u %u %u %u)", in, ic, ih, iw, on, oc, oh, ow); sprintf(buffer, "16bit%20s, %80s", "topk", params); - F16 *output_cpu = (F16 *)malloc(sizeof(F16) * p.topk); - I32 *output_id_cpu = (I32 *)malloc(sizeof(I32) * p.topk); - F16 *res_gpu_sort = (F16 *)malloc(sizeof(F16) * p.topk); - I32 *res_id_gpu_sort = (I32 *)malloc(sizeof(I32) * p.topk); - topk_cpu_max((F16 *)input_cpu, len, p.topk, output_cpu, output_id_cpu); + F16 *output_cpu = (F16 *)malloc(sizeof(F16) * p.k); + I32 
*output_id_cpu = (I32 *)malloc(sizeof(I32) * p.k); + F16 *res_gpu_sort = (F16 *)malloc(sizeof(F16) * p.k); + I32 *res_id_gpu_sort = (I32 *)malloc(sizeof(I32) * p.k); + topk_cpu_max((F16 *)input_cpu, len, p.k, output_cpu, output_id_cpu); sort_gpu_result( - (F16 *)output_gpu, (I32 *)output_indices_gpu, p.topk, res_gpu_sort, res_id_gpu_sort); + (F16 *)output_gpu, (I32 *)output_indices_gpu, p.k, res_gpu_sort, res_id_gpu_sort); - ut_check_a(res_gpu_sort, output_cpu, p.topk, dt); - ut_check_a(res_id_gpu_sort, output_id_cpu, p.topk, dt); + ut_check_a(res_gpu_sort, output_cpu, p.k, dt); + ut_check_a(res_id_gpu_sort, output_id_cpu, p.k, dt); CHECK_STATUS(gcl_finish(handle)); CHECK_STATUS(gcl_clean_kernelVec(handle)); diff --git a/compute/tensor/tests/test_transpose.cpp b/compute/tensor/tests/test_transpose.cpp index 0d707b90..2df702ec 100644 --- a/compute/tensor/tests/test_transpose.cpp +++ b/compute/tensor/tests/test_transpose.cpp @@ -24,12 +24,12 @@ int transposeTest(int argc, char **argv, DataType dt) U32 ih = atoi(argv[3]); U32 iw = atoi(argv[4]); TransposeParamSpec p, p_inv; - p.trans_size = 4; - p_inv.trans_size = 4; - for (int i = 0; i < 4; i++) { + p.num_axes = 4; + p_inv.num_axes = 4; + for (U32 i = 0; i < p_inv.num_axes; i++) { I32 value = atoi(argv[5 + i]); - p.trans_dims[i] = value; - p_inv.trans_dims[value] = i; + p.axes[i] = value; + p_inv.axes[value] = i; } DataFormat df = DF_NCHW; @@ -39,7 +39,7 @@ int transposeTest(int argc, char **argv, DataType dt) Tensor inputTensor; inputTensor.resize(inDesc); inputTensor.alloc(); - memcpy(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); + UNI_MEMCPY(get_ptr_from_tensor(inputTensor, CPU_GENERAL), input, tensorNumBytes(inDesc)); Tensor outputTensor1; Tensor outputTensor2; diff --git a/compute/tensor/tests/test_transpose_ocl.cpp b/compute/tensor/tests/test_transpose_ocl.cpp index 0386d451..16585391 100644 --- a/compute/tensor/tests/test_transpose_ocl.cpp +++ b/compute/tensor/tests/test_transpose_ocl.cpp @@ -31,13 +31,13 @@ int transposeTest(int argc, char **argv, DataType dt) inputDesc_cpu.df = DF_NCHW; } CHECK_REQUIREMENT(argc == (int)(nDims * 2 + 2)); - p.trans_size = nDims; - p_inv.trans_size = nDims; + p.num_axes = nDims; + p_inv.num_axes = nDims; for (U32 i = 0; i < nDims; i++) { inputDesc_cpu.dims[nDims - 1 - i] = atoi(argv[2 + i]); I32 value = atoi(argv[2 + nDims + i]); - p.trans_dims[i] = value; - p_inv.trans_dims[value] = i; + p.axes[i] = value; + p_inv.axes[value] = i; } inputDesc_gpu = inputDesc_cpu; @@ -52,7 +52,7 @@ int transposeTest(int argc, char **argv, DataType dt) Tensor inputTensorCpu; inputTensorCpu.resize(inputDesc_cpu); inputTensorCpu.alloc(); - memcpy( + UNI_MEMCPY( get_ptr_from_tensor(inputTensorCpu, CPU_GENERAL), input_cpu, tensorNumBytes(inputDesc_cpu)); Tensor outputTensorCpu; Tensor tmpTensorCpu; diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 5f8612e2..ff0020ea 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -20,7 +20,9 @@ - [engine](../inference/engine) hosts the inference engine of neural networks. - [flow](../inference/flow) hosts the multi-backends(CPU+GPU) heterogeneous device schedule for time series data. - [examples](../inference/examples) gives some application examples (Network Benchmark, ImageNet classification). -- [kit](../kit) +- [kit](../kit) - kit provides some application demos. +- [Training](../training) + - training provides all on-device training modules and examples. 
For API, Flow and operator development, please refer to [DEVELOPER.md](DEVELOPER.md). diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 05400769..73ac5ac2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -7,16 +7,54 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](). +### [1.3.0] - 2022-2-28 + +#### Added + +- Support on-device training for MLP, CNN(lenet, resnet50, mobilenetv1), transformer/bert(text to speech) +- Support changing model input and output names in X2bolt +- Support more graph optimizations : Transpose+Convolution, Swish, Quantization, Power+Scale +- Support dynamic output related operators : Shape, ConstantOfShape, GenerateProposals, NonZero, NonMaxSuppression, Reshape, etc +- Support more operators : GridSample, CumSum, OneHot, Round, Floor, Ceil +- Support more networks on CPU : yolov2, yolov3, yolov4, yolov5, faster-rcnn, mask-rcnn, retinanet, dfsmn, frill, conformer, unet, etc +- Support Armv8 int8 to accelerate NLP network +- Improve inference performance on avx2 CPU +- Support netron to visualize bolt models +- Support not binding CPU cores +- Add C API MemoryCheck to check for bolt memory leaks + +#### Changed + +- X2bolt adds -I and -O options to change model input and output names. +- X2bolt adds -t option to convert models for on-device training. +- C API CreateModel and AllocAllResultHandle return NULL when unsuccessful. +- install.sh adds --neon option to disable arm neon acceleration on old platforms. +- Some operator parameter definitions + +#### Fixed + +- Fix GPU depth2space and deconv bug +- Fix GPU preprocess tool on armv8 platform bug +- Fix x86 Sigmoid precision +- Fix C API CloneResultHandle bug +- Fix mobilenetv1 int8 inference +- Fix Java API build bug on Windows +- Fix ONNX converter deconv, pooling parameter bug + +#### Removed + +- Equal operator is replaced with Check. + ### [1.2.1] - 2021-9-11 #### Added - Support more graph optimizations : Convolution+Convolution, LayerNorm -- Support more operators: ROIAlign, GenerateProposals, Reciprocal, Not, Log, ReductionL2, InstanceNorm, Expand, Gather, Scatter +- Support more operators : ROIAlign, GenerateProposals, Reciprocal, Not, Log, ReductionL2, InstanceNorm, Expand, Gather, Scatter - Support more operators(PReLU) process NCHW input data. - Support ONNX share weight between Linear, MatMul, Gemm and Gather -- Support more networks on CPU: vision transformer(ViT, TNT), recommendation networks +- Support more networks on CPU : vision transformer(ViT, TNT), recommendation networks - Support more networks on GPU : ASR, Faster_RCNN - Support Armv7 int8 to accelerate NLP network(50%+ speed-up) - Support X86 AVX512 int8 to accelerate NLP network(3x+ speed-up) diff --git a/docs/DEVELOPER.md b/docs/DEVELOPER.md index 8e69f718..698462cc 100644 --- a/docs/DEVELOPER.md +++ b/docs/DEVELOPER.md @@ -24,7 +24,7 @@ you can customize the unsupported operators step by step which has been describe ### C API -Bolt provides C API document generated by doxygen to help you use [C API](../inference/engine/api/c/bolt.h), [image classification example](../inference/examples/c_api/c_image_classifification.c) and [Chinese input method example](../inference/examples/c_api/c_input_method.c).
+Bolt provides C API document generated by doxygen to help you use [C API](../inference/engine/api/c/bolt.h), [image classification example](../inference/examples/c_api/c_image_classification.c) and [Chinese input method example](../inference/examples/c_api/c_input_method.c). You can compile it and link *libbolt.so* library with your C/C++ project. ### Java API @@ -120,13 +120,13 @@ In [model_tools](../model_tools), you can define any operator for model conversi unsigned int stride_t; unsigned int stride_h; unsigned int stride_w; - unsigned int padding_before; - unsigned int padding_after; - unsigned int padding_top; - unsigned int padding_bottom; - unsigned int padding_left; - unsigned int padding_right; - RoundMode rm; + unsigned int pad_before; + unsigned int pad_after; + unsigned int pad_top; + unsigned int pad_bottom; + unsigned int pad_left; + unsigned int pad_right; + RoundMode round_mode; PoolingMode mode; } PoolingParamSpec; // <====== Addition @@ -149,13 +149,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_caffe_type(std::string inputType) { - // Addition ======> - if (inputType == "Pooling") { - return OT_Pooling; - } // <====== Addition - else if (inputType == "Convolution") { - ... - } + std::map operatorMap = { + // Addition ======> + {"Pooling", OT_Pooling}, + // <====== Addition + }; } ``` @@ -163,13 +161,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... + std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -183,62 +179,62 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec pps; - memset(&pps, 0, sizeof(pps)); - pps.kernel_t = 1; - pps.stride_t = 1; - pps.padding_before = 0; - pps.padding_after = 0; - if (layer.pooling_param().has_kernel_w() && layer.pooling_param().has_kernel_h()) { - pps.kernel_w = layer.pooling_param().kernel_w(); - pps.kernel_h = layer.pooling_param().kernel_h(); + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + p.kernel_t = 1; + p.stride_t = 1; + p.pad_before = 0; + p.pad_after = 0; + auto cp = layer.pooling_param(); + if (cp.has_kernel_w() && cp.has_kernel_h()) { + p.kernel_w = cp.kernel_w(); + p.kernel_h = cp.kernel_h(); } else { - pps.kernel_h = layer.pooling_param().kernel_size(); - pps.kernel_w = pps.kernel_h; + p.kernel_h = cp.kernel_size(); + p.kernel_w = p.kernel_h; } - if (layer.pooling_param().has_stride_w() && layer.pooling_param().has_stride_h()) { - pps.stride_w = layer.pooling_param().stride_w(); - pps.stride_h = layer.pooling_param().stride_h(); + if (cp.has_stride_w() && cp.has_stride_h()) { + p.stride_w = cp.stride_w(); + p.stride_h = cp.stride_h(); } else { - pps.stride_h = layer.pooling_param().stride(); - pps.stride_w = pps.stride_h; + p.stride_h = cp.stride(); + p.stride_w = p.stride_h; } - bool global_pooling = layer.pooling_param().global_pooling(); + bool global_pooling = cp.global_pooling(); if (global_pooling) { - pps.kernel_h = 0; - pps.kernel_w = 0; - pps.stride_h = 1; - pps.stride_w = 1; + p.kernel_h = 0; + p.kernel_w = 0; + p.stride_h = 1; + p.stride_w = 
1; } else { - CHECK_REQUIREMENT(pps.kernel_h > 0); + CHECK_REQUIREMENT(p.kernel_h > 0); } - if (layer.pooling_param().has_pad_w() && layer.pooling_param().has_pad_h()) { - pps.padding_left = layer.pooling_param().pad_w(); - pps.padding_right = pps.padding_left; - pps.padding_top = layer.pooling_param().pad_h(); - pps.padding_bottom = pps.padding_top; + if (cp.has_pad_w() && cp.has_pad_h()) { + p.pad_left = cp.pad_w(); + p.pad_right = p.pad_left; + p.pad_top = cp.pad_h(); + p.pad_bottom = p.pad_top; } else { - pps.padding_top = layer.pooling_param().has_pad() ? layer.pooling_param().pad() : 0; - pps.padding_bottom = pps.padding_top; - pps.padding_left = pps.padding_top; - pps.padding_right = pps.padding_top; + p.pad_top = cp.has_pad() ? cp.pad() : 0; + p.pad_bottom = p.pad_top; + p.pad_left = p.pad_top; + p.pad_right = p.pad_top; } - if (layer.pooling_param().has_round_mode() && layer.pooling_param().round_mode() == 1) { - pps.rm = FLOOR; + if (cp.has_round_mode() && cp.round_mode() == 1) { + p.round_mode = ROUND_FLOOR; } else { - pps.rm = CEIL; + p.round_mode = ROUND_CEIL; } - auto op = layer.pooling_param().pool(); + auto op = cp.pool(); switch (op) { case caffe::PoolingParameter_PoolMethod_MAX: { - pps.mode = POOLING_MAX; + p.mode = POOLING_MAX; break; } case caffe::PoolingParameter_PoolMethod_AVE: { - pps.mode = POOLING_MEAN; + p.mode = POOLING_MEAN; break; } default: { @@ -248,9 +244,9 @@ In [model_tools](../model_tools), you can define any operator for model conversi this->layer.name().c_str(), descriptor->FindValueByNumber(op)->name().c_str()); } } - curPs.pooling_spec = pps; - return curPs; - } + ps.pooling_spec = p; + return ps; + } // <====== Addition ``` @@ -274,13 +270,13 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_onnx_type(std::string inputType) { - // Addition ======> - if (inputType == "AveragePool" || inputType == "MaxPool" || inputType == "GlobalAveragePool") { - return OT_Pooling; - } // <====== Addition - else if (inputType == "Conv") { - ... - } + std::map operatorMap = { + // Addition ======> + {"AveragePool", OT_Pooling}, + {"MaxPool", OT_Pooling}, + {"GlobalAveragePool", OT_Pooling}, + // <====== Addition + }; } ``` @@ -288,13 +284,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... 
+ std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -308,82 +302,89 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec pps; - memset(&pps, 0, sizeof(pps)); - std::string autoPad = get_node_str_attribute_by_name(node, "auto_pad"); // deprecated - std::vector kernelShape = get_node_vector_ints_attribute_by_name(node, "kernel_shape"); - std::vector strides = get_node_vector_ints_attribute_by_name(node, "strides"); - std::vector pads = get_node_vector_ints_attribute_by_name(node, "pads"); - - if (op == "AveragePool" || op == "ReduceMean" || op == "GlobalAveragePool") { - pps.mode = POOLING_MEAN; + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + std::string autoPad = get_string(this->onnxNode, "auto_pad"); + std::vector kernels = get_ints(this->onnxNode, "kernel_shape"); + std::vector strides = get_ints(this->onnxNode, "strides"); + std::vector pads = get_ints(this->onnxNode, "pads"); + int ceil_mode = get_int(this->onnxNode, "ceil_mode", 0); + + const std::string &onnxNodeType = this->onnxNode.op_type(); + if (onnxNodeType == "AveragePool" || onnxNodeType == "ReduceMean" || + onnxNodeType == "GlobalAveragePool") { + p.mode = POOLING_MEAN; } else { - pps.mode = POOLING_MAX; + p.mode = POOLING_MAX; } - if (autoPad == "SAME_UPPER") { - pps.rm = CEIL; + if (ceil_mode) { + p.round_mode = ROUND_CEIL; } else { - pps.rm = FLOOR; + p.round_mode = ROUND_FLOOR; } - pps.kernel_t = 0; - pps.kernel_h = 0; - pps.kernel_w = 0; - if (kernelShape.size() == 3) { - pps.kernel_t = kernelShape[0]; - pps.kernel_h = kernelShape[1]; - pps.kernel_w = kernelShape[2]; - } else if (kernelShape.size() == 2) { - pps.kernel_t = 1; - pps.kernel_h = kernelShape[0]; - pps.kernel_w = kernelShape[1]; - } else if (kernelShape.size() == 1) { - pps.kernel_t = 1; - pps.kernel_h = kernelShape[0]; - pps.kernel_w = 1; + p.kernel_t = 0; + p.kernel_h = 0; + p.kernel_w = 0; + if (kernels.size() == 3) { + p.kernel_t = kernels[0]; + p.kernel_h = kernels[1]; + p.kernel_w = kernels[2]; + } else if (kernels.size() == 2) { + p.kernel_t = 1; + p.kernel_h = kernels[0]; + p.kernel_w = kernels[1]; + } else if (kernels.size() == 1) { + p.kernel_t = 1; + p.kernel_h = kernels[0]; + p.kernel_w = 1; } - pps.stride_t = 1; - pps.stride_h = 1; - pps.stride_w = 1; + p.stride_t = 1; + p.stride_h = 1; + p.stride_w = 1; if (strides.size() == 3) { - pps.stride_t = strides[0]; - pps.stride_h = strides[1]; - pps.stride_w = strides[2]; + p.stride_t = strides[0]; + p.stride_h = strides[1]; + p.stride_w = strides[2]; } else if (strides.size() == 2) { - pps.stride_h = strides[0]; - pps.stride_w = strides[1]; + p.stride_h = strides[0]; + p.stride_w = strides[1]; } else if (strides.size() == 1) { - pps.stride_h = strides[0]; + p.stride_h = strides[0]; } - pps.padding_before = 0; - pps.padding_top = 0; - pps.padding_left = 0; - pps.padding_after = 0; - pps.padding_bottom = 0; - pps.padding_right = 0; + p.pad_before = 0; + p.pad_top = 0; + p.pad_left = 0; + p.pad_after = 0; + p.pad_bottom = 0; + p.pad_right = 0; if (pads.size() == 6) { - pps.padding_before = pads[0]; - pps.padding_top = pads[1]; - pps.padding_left = pads[2]; - pps.padding_after = pads[3]; - pps.padding_bottom = pads[4]; - pps.padding_right = pads[5]; + p.pad_before = pads[0]; + p.pad_top = pads[1]; + p.pad_left = pads[2]; + 
p.pad_after = pads[3]; + p.pad_bottom = pads[4]; + p.pad_right = pads[5]; } else if (pads.size() == 4) { - pps.padding_top = pads[0]; - pps.padding_left = pads[1]; - pps.padding_bottom = pads[2]; - pps.padding_right = pads[3]; + p.pad_top = pads[0]; + p.pad_left = pads[1]; + p.pad_bottom = pads[2]; + p.pad_right = pads[3]; } else if (pads.size() == 2) { - pps.padding_top = pads[0]; - pps.padding_bottom = pads[1]; + p.pad_top = pads[0]; + p.pad_bottom = pads[1]; + } else if (autoPad == "SAME_UPPER") { + p.pad_top = (p.kernel_h - 1) / 2; + p.pad_bottom = (p.kernel_h - 1) - p.pad_top; + p.pad_left = (p.kernel_w - 1) / 2; + p.pad_right = (p.kernel_w - 1) - p.pad_left; } - curPs.pooling_spec = pps; - return curPs; + ps.pooling_spec = p; + return ps; } // <======= Addition ``` @@ -408,13 +409,12 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ OperatorType convert_tflite_type(tflite::BuiltinOperator tfliteType) { - // Addition ======> - if (tfliteType == tflite::BuiltinOperator_MAX_POOL_2D || tfliteOperatorType == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - return OT_Pooling; - } // <====== Addition - else if (tfliteType == tflite::BuiltinOperator_CONCATENATION) { - ... - } + std::map operatorMap = { + // Addition ======> + {tflite::BuiltinOperator_MAX_POOL_2D, OT_Pooling}, + {tflite::BuiltinOperator_AVERAGE_POOL_2D, OT_Pooling}, + // <====== Addition + }; } ``` @@ -422,13 +422,11 @@ In [model_tools](../model_tools), you can define any operator for model conversi ```c++ virtual EE adapt_operator(OperatorType type, ParameterSpec *ps) { - ... - // Addition ======> - else if (type == OT_Pooling) { - *ps = adapt_Pooling(); - } - // <====== Addition - ... + std::map functions = { + // Addition ======> + {OT_Pooling, &ModelAdaptee::adapt_Pooling}, + // <====== Addition + }; } // Addition ======> @@ -442,19 +440,18 @@ In [model_tools](../model_tools), you can define any operator for model conversi // Addition ======> ParameterSpec adapt_Pooling() override { - ParameterSpec curPs; - memset(&curPs, 0, sizeof(curPs)); - PoolingParamSpec poolingPs; - memset(&poolingPs, 0, sizeof(poolingPs)); - poolingPs.kernel_t = 1; - poolingPs.stride_t = 1; - poolingPs.padding_before = 0; - poolingPs.padding_after = 0; - poolingPs.padding_top = 0; - poolingPs.padding_bottom = 0; - poolingPs.padding_left = 0; - poolingPs.padding_right = 0; - poolingPs.rm = CEIL; + ParameterSpec ps; + PoolingParamSpec p; + memset(&p, 0, sizeof(p)); + p.kernel_t = 1; + p.stride_t = 1; + p.pad_before = 0; + p.pad_after = 0; + p.pad_top = 0; + p.pad_bottom = 0; + p.pad_left = 0; + p.pad_right = 0; + p.round_mode = ROUND_CEIL; const auto &inputTensor = this->tfliteTensors[this->tfliteOperators[this->tfliteOperatorIndex]->inputs[0]]; @@ -466,49 +463,47 @@ In [model_tools](../model_tools), you can define any operator for model conversi const auto &axisData = tfliteModelBuffer[axisTensor->buffer]->data; auto axisPtr = reinterpret_cast(axisData.data()); CHECK_REQUIREMENT(1 == axisPtr[0] && 2 == axisPtr[1]); - poolingPs.mode = POOLING_MEAN; - poolingPs.kernel_h = 0; - poolingPs.kernel_w = 0; - poolingPs.stride_h = 1; - poolingPs.stride_w = 1; + p.mode = POOLING_MEAN; + p.kernel_h = 0; + p.kernel_w = 0; + p.stride_h = 1; + p.stride_w = 1; } else { const auto &tflitePoolOption = this->tfliteOperators[this->tfliteOperatorIndex]->builtin_options.AsPool2DOptions(); - poolingPs.kernel_h = tflitePoolOption->filter_height; - poolingPs.kernel_w = tflitePoolOption->filter_width; - poolingPs.stride_h = 
tflitePoolOption->stride_h; - poolingPs.stride_w = tflitePoolOption->stride_w; + p.kernel_h = tflitePoolOption->filter_height; + p.kernel_w = tflitePoolOption->filter_width; + p.stride_h = tflitePoolOption->stride_h; + p.stride_w = tflitePoolOption->stride_w; int tfPaddingRoundMode = tflitePoolOption->padding; if (tfPaddingRoundMode == 0) { - poolingPs.rm = TF_SAME; - - int oLength = (inputShape[2] + poolingPs.stride_w - 1) / poolingPs.stride_w; - int padLength = UNI_MAX( - (oLength - 1) * poolingPs.stride_w + poolingPs.kernel_w - inputShape[2], 0); - poolingPs.padding_left = padLength / 2; - poolingPs.padding_right = padLength - poolingPs.padding_left; - - oLength = (inputShape[1] + poolingPs.stride_h - 1) / poolingPs.stride_h; - padLength = UNI_MAX( - (oLength - 1) * poolingPs.stride_h + poolingPs.kernel_h - inputShape[1], 0); - poolingPs.padding_top = padLength / 2; - poolingPs.padding_bottom = padLength - poolingPs.padding_top; + p.round_mode = ROUND_TF_SAME; + + int oLength = (inputShape[2] + p.stride_w - 1) / p.stride_w; + int padLength = UNI_MAX((oLength - 1) * p.stride_w + p.kernel_w - inputShape[2], 0); + p.pad_left = padLength / 2; + p.pad_right = padLength - p.pad_left; + + oLength = (inputShape[1] + p.stride_h - 1) / p.stride_h; + padLength = UNI_MAX((oLength - 1) * p.stride_h + p.kernel_h - inputShape[1], 0); + p.pad_top = padLength / 2; + p.pad_bottom = padLength - p.pad_top; } else if (tfPaddingRoundMode == 1) { - poolingPs.rm = TF_VALID; + p.round_mode = ROUND_TF_VALID; } else { UNI_ERROR_LOG("can not process operator location:%d Pooling round mode.\n", this->tfliteOperatorIndex); } if (opCode == tflite::BuiltinOperator_MAX_POOL_2D) { - poolingPs.mode = POOLING_MAX; + p.mode = POOLING_MAX; } else if (opCode == tflite::BuiltinOperator_AVERAGE_POOL_2D) { - poolingPs.mode = POOLING_MEAN; + p.mode = POOLING_MEAN; } insertActivationOperator( getActivationOperatorType(tflitePoolOption->fused_activation_function)); } - curPs.pooling_spec = poolingPs; - return curPs; + ps.pooling_spec = p; + return ps; } // <====== Addition ``` diff --git a/docs/INSTALL.md b/docs/INSTALL.md index 0379054d..093156f8 100644 --- a/docs/INSTALL.md +++ b/docs/INSTALL.md @@ -57,10 +57,10 @@ - #### Android NDK - Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r20](https://developer.android.google.cn/ndk/downloads) and set shell environment variable **ANDROID_NDK_ROOT**. + Refer to the [NDK installation example](https://askubuntu.com/questions/837847/how-to-install-android-ndk) to install [android-ndk-r22b](https://developer.android.google.cn/ndk/downloads) and set shell environment variable **ANDROID_NDK_ROOT**. ``` - export ANDROID_NDK_ROOT=/data/opt/android-ndk-r20 + export ANDROID_NDK_ROOT=/data/opt/android-ndk-r22b ``` ### Linux-AArch64 Target System Cross-Compilation Tools @@ -143,7 +143,7 @@ We will install Bolt to *install_[target]* directory. These subdirectories will - *benchmark* for measuring inference performance of bolt model These examples will be build when using *--example* option - - *classification* for imagenet classification task,*c_image_classifification* is a simplified C API version + - *classification* for imagenet classification task,*c_image_classification* is a simplified C API version - *u2net* for object detection - *ultra_face* for face detection - *tinybert* and *tinybert_onnx* for tinybert intention identification @@ -176,6 +176,9 @@ We will install Bolt to *install_[target]* directory. 
These subdirectories will 7. optional. save to *third_party/sources/jpegsrc.v9c.tar.gz* when using example. 8. optional. save to *third_party/sources/ffts-master.zip* when using Flow. 9. optional. save to *third_party/sources/opencv-4.5.2.zip* when using face detection example. + 10. optional. save to *third_party/sources/half-2.2.0.zip* when using on-device training. + 11. optional. save to *third_party/sources/Yato-9b5a49f6ec4169b67b9e5ffd11fdae9c238b0a3d.zip* when using on-device training. + 12. optional. save to *third_party/sources/huawei_secure_c-master.zip* when using Huawei secure C functions. - #### MinGW version error diff --git a/docs/KIT.md b/docs/KIT.md index f11a11b7..009800c8 100644 --- a/docs/KIT.md +++ b/docs/KIT.md @@ -5,6 +5,11 @@         [Android Overview](#android-overview)     [Examples](#examples)         [Image Classification](#image-classification) +        [Camera Enlarge](#camera-enlarge) +        [Semantics Analysis](#semantics-analysis) +        [Chinese Speech Recognition](#chinese-speech-recognition) +        [Face Detection](#face-detection) +        [Reading Comprehension](#reading-comprehension) # Overview --- @@ -31,7 +36,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde - ### Image Classification -
+
The demo takes video input from camera, and uses [GhostNet](https://github.com/huawei-noah/ghostnet) model trained on ImageNet. Given the same FLOPs, GhostNet shows a clear advantage over other lightweight CNNs. The models that we provide are trained with width as 1.0 on TensorFlow, which reaches a TOP1 accuracy of 74%. @@ -73,15 +78,15 @@ In the [kit](../kit) directory, you can find the available demo project. In orde - ### Camera Enlarge -
+
The demo takes video input from camera, 32 pixels x 32 pixels, and uses [ESR_EA](https://github.com/huawei-noah/vega/blob/master/docs/en/algorithms/esr_ea.md) model to enlarge input image to 64 pixels x 64 pixels. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP16 ESR_EA that is also included in the project (kit/models). - 0. Similar with Image Classification + 0. Similar with Image Classification. - 1. Similar with Image Classification + 1. Similar with Image Classification. 2. Adjust the pixelProcess function, which is registered as the preprocessing function for the Inference node. For FP16 inference, actual input to the model should be in FP16: @@ -118,9 +123,9 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` -- ### Semantics +- ### Semantics Analysis -
+
The demo tokenize input words, and use [tinybert](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) model to do senmantic analysis. @@ -160,7 +165,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde float[][] result = boltResult.getResultData(); ``` - 3. Obtain the analysis result by comparing the size of the two probabilities in the result array + 3. Obtain the analysis result by comparing the size of the two probabilities in the result array. ``` if (result[0][0]>result[0][1]) { @@ -170,17 +175,17 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` -- ### ChineseSpeechRecognition +- ### Chinese Speech Recognition -
+
The demo recognizes the input Chinese speech, and uses the [ASR](https://github.com/huawei-noah/xxx) model to convert Chinese text. You can easily switch to other models trained on other datasets, following the steps below. As a tutorial, we will show how to change the model to the FP32 ASR that is also included in the project. - 0. Call the copyAssetAndWrite method to copy the path, and then change the path of the bin file and bolt model in the prototxt file to the copied path + 0. Call the copyAssetAndWrite method to copy the path, and then change the path of the bin file and bolt model in the prototxt file to the copied path. - 1. Import flow_asr.h in native-lib, flow_asr defines the pre- and post-processing methods and the initialization of flow and the acquisition of results,add init method and get result method in native-lib.cpp + 1. Import flow_asr.h in native-lib, flow_asr defines the pre- and post-processing methods and the initialization of flow and the acquisition of results,add init method and get result method in native-lib.cpp. ``` extern "C" @@ -206,7 +211,7 @@ In the [kit](../kit) directory, you can find the available demo project. In orde } ``` - 2. Call Jni method initFlow + 2. Call Jni method initFlow. ``` initFlow(getCacheDir()+"/encoder_flow.prototxt",getCacheDir()+"/prediction_flow.prototxt", @@ -214,28 +219,67 @@ In the [kit](../kit) directory, you can find the available demo project. In orde ``` - 3. Call Jni method runFlow Incoming audio files in wav format get result + 3. Call Jni method runFlow Incoming audio files in wav format get result. ``` runFlow(wavFileName) ``` -- ### FaceDetection +- ### Face Detection -
+
The demo detects the input picture, and outputs A photo framed a human face. - 0. bolt path get Similar with Semantics + 0. bolt path get Similar with Semantics. - 1. Call the getDetectionImgPath method Bitmap and model path to go directly to the detection result picture path + 1. Call the getDetectionImgPath method Bitmap and model path to go directly to the detection result picture path. ``` resultImgPath=boltResult.getDetectionImgPath(bitmap,boltPath); ``` - 2. The parameters in the prior_boxes_generator method in the jni method initBolt are fixed input parameters of the model and cannot be changed + 2. The parameters in the prior_boxes_generator method in the jni method initBolt are fixed input parameters of the model and cannot be changed. ``` prior_boxes_generator(320,240,0.7,0.3); - ``` \ No newline at end of file + ``` + +- ### Reading Comprehension + +
+ + +The demo is to input a piece of content, and input a content-related question will output the corresponding answer + +0. Call the copyAssetAndWrite method to copy the path, and the model path is used in the BoltModel class. + +1. Incoming content and questions to obtain the input data required by the dynamic library. + + ``` + float[][] tokenizers = appTokenizer.runTokenizer(content.getText().toString(), question.getText().toString()); + ``` + + 2. set the input and output names and other input parameters according to your model to initialize BoltModel. + + ``` + BoltModel boltModel = new BoltModel(modelPath, AffinityType.CPU_HIGH_PERFORMANCE, inputNum, inputName, inputN,inputCMax, inputH, inputW, inputDatatype, inputDataFormat, outputNum, outputName); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, inputW, + inputDatatype, inputDataFormat, tokenizers); + + ``` + + 3. Call the run method of the BoltModel class to obtain the output result. Tokenizers are the processed input data, and inputCActual is the actual length of the input data. Call getResultData of BoltResult class to get the analysis result, get the result array, two float data. + + ``` + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, inputCActual, inputH, inputW, + inputDatatype, inputDataFormat, tokenizers); + float[][] result = boltResult.getResultData(); + ``` + + 4. Call the getResultAnswer method to get the answer of the output result conversion + + ``` + String resultStr = getResultAnswer(result); + + ``` \ No newline at end of file diff --git a/docs/OPERATORS.md b/docs/OPERATORS.md index 75a15a08..73e0836e 100644 --- a/docs/OPERATORS.md +++ b/docs/OPERATORS.md @@ -12,7 +12,7 @@ | Prelu | prelu activation | | BatchNorm | y = (x - mean) / sqrt(variance + eps) per channel | | LayerNorm | layernorm | -| L2Normalization | L2-Normalization | +| L2Normalization | L2 Normalization | | Reduction | sum, min, max, mean reduction | | ArgMax | max value index | | Softmax | y = exp(x - max(x)) / sum(exp(x - max(x))) | @@ -51,7 +51,7 @@ | PreAllocatedMemory | allocate memory | | SharedWeight | used to represent onnx/tflite operator input that is not generated by another operator | | Copy | memory copy | -| Check | tensor level compare, result is used for Jump | +| Check | element level compare, same as onnx Greater, GreaterOrEqual, Equal, LowerOrEqual, Lower | | Repeat | do while loop for dynamic control flow | | Jump | if statement for dynamic control flow | | Attention | transformer global attention mask | @@ -70,22 +70,35 @@ | Where | onnx where| | SoftPlus | y = log(1 + e ^ x)| | Exp | y = exp(x) | -| Split | y = x | +| OneHot | same as onnx one hot | | Tdnn | Kaldi tdnn operator(Splice + Linear) | | Dropout | dropout function | | TopK | same as onnx topk | | SpaceToBatchNd | tensorflow space_to_batch function | | BatchToSpaceNd | tensorflow batch_to_space function | | Abs | y = (x > 0) ? x : -x | -| Equal | elementwise tensor compare, same as onnx equal, this also support tflite NOT_EQUAL | +| NonZero | same as onnx non zero | | Sign | y = sign(x) | | HSwishNoDiv | y = x * relu6(x + 3) | | InstanceNorm | Instance Normalization | | Expand | onnx expand | | Scatter | onnx scatter, scatter_elements, scatterND | -| Log | y = log(x) | | Select | y = choice ? a : b, same as tflite select | | Not | y = ! 
(x), same as onnx not | -| RoIAlign | same as onnx RoIAlign | +| Reciprocal | same as onnx reciprocal, y = 1 / x | +| Log | y = log(x) | | GenerateProposals | same as tf tf.image.generate_bounding_box_proposals | -| Reciprocal | same as onnx reciprocal | +| RoIAlign | same as onnx RoIAlign | +| GAT | graph attention module | +| QuantizeLinear | int8 quantization | +| Round | y = round(x) | +| Floor | y = floor(x) | +| Ceil | y = ceil(x) | +| RandomUniform | same as onnx random uniform | +| CumSum | prefix sum, same as onnx cumsum | +| GridSample | same as onnx grid_sample | +| NonMaxSuppression | same as onnx non max suppression | +| Range | same as onnx range | +| Swish | y = x * exp(x) | +| Split | y = x | +| ~~Equal~~ | elementwise tensor compare, same as onnx equal, this also support tflite NOT_EQUAL, Equal is replaced with Check | diff --git a/docs/REDUCE_GPU_PREPARE_TIME.md b/docs/REDUCE_GPU_PREPARE_TIME.md index b3825009..20ff3c8c 100644 --- a/docs/REDUCE_GPU_PREPARE_TIME.md +++ b/docs/REDUCE_GPU_PREPARE_TIME.md @@ -1,25 +1,30 @@ -# How to reduce gpu initial time +# How to reduce gpu inference overhead --- -Bolt support ARM Mali GPU, large addtitional prepare time is cost due to algorithm selecting and building kernel from source code. +Bolt supports ARM GPU inference with OpenCL. +But building OpenCL kernel function from source code and selecting optimal algorithm takes up a lot of time. +They can be optimized by preparing the OpenCL binary function library and algorithm file in advance. +Inference can directly use prepared files. -- ### Build extra resources for reducing prepare time on GPU +- ### Build OpenCL binary kernel library - Bolt provides offline tools [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh) to reduce GPU prepare time. We have test mobilenet_v1 on MALI G76 GPU. Prepare time can be reduced from 2-3s to 60ms after build algorithm file and OpenCL kernel binary. Here we give an exaple: + Bolt provides offline tool [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh) to reduce GPU prepare time. + We have test mobilenet_v1 model on ARM MALI G76 GPU. Prepare time can be reduced from 2-3s to 60ms after building OpenCL binary kernel and algorithm file. 
+ Here we give an example: -- #### Step By Step + - #### Step By Step - <1> Connect target device by Andriod ADB; + <1> Connect target device by using Andriod *ADB*; - <2> Convert your models to .bolt with X2bolt; + <2> Convert your models to xxx.bolt by using *X2bolt*; - <3> Make a write/read able folder on target device, copy all your needed .bolt models into it, E.g: + <3> Create a directory on target device, copy all your needed xxx.bolt models into it, E.g: ``` adb shell "mkdir /data/local/tmp/preprocess_bolt_models" adb shell "cp ${boltModelDir}/*.bolt /data/local/tmp/preprocess_bolt_models" ``` - <4> Set essential variables for sh */inference/engine/tools/preproces_ocl/build_preprocess_ocl.sh*: + <4> Set essential command line arguments for shell script [preprocess_ocl](../inference/engine/tools/preprocess_ocl/build_preprocess_ocl.sh): - dNum: Device serial number, which can be aquired by using command @@ -30,32 +35,36 @@ Bolt support ARM Mali GPU, large addtitional prepare time is cost due to algorit - device_bolt_models: which is created in step <3>; for example: + ``` - ./build_preprocess_ocl.sh --device dNum --target android-aarch64 -d device_bolt_models + ./build_preprocess_ocl.sh --device 435bc850 --target android-aarch64 -d /data/local/tmp/preprocess_bolt_models ``` <5> Run *build_preprocess_ocl.sh* on host; - After running build_preprocess_ocl.sh successfully, these extra xxxlib.so will be produced: - - - OpenCL kernel bin dynamic library: All needed kernels for your model has been compiled from sources to bins, and package into .so, such as: *${BOLT_ROOT}/inference/engine/tools/preprocess_ocl/lib/libMali_G76p_map.so* + After running build_preprocess_ocl.sh successfully, OpenCL binary kernel shared library libxxx_map.so will be produced. + All needed kernels for your models has been compiled from sources to bins, + and packaged into libxxx_map.so, such as *${BOLT_ROOT}/inference/engine/tools/preprocess_ocl/lib/libMali_G76p_map.so* + +- ### Use OpenCL binary kernel library to reduce gpu prepare time for your model -- ### Use algorithm file and kernel binary dynamic library to reduce gpu prepare time for your model + - #### Reduce Imagenet classification prepare time -- #### Reduce Imagenet classification prepare time - ``` - adb shell "mkdir /data/local/tmp/kits" - adb push install_arm_llvm/kits/classification /data/local/tmp/kits - adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits - adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU" - ``` + ``` + adb shell "mkdir /data/local/tmp/kits" + adb push install_arm_llvm/kits/classification /data/local/tmp/kits + adb push tools/preprocess_ocl/lib/libMali_G76p_map.so /data/local/tmp/kits + adb shell "cd /data/local/tmp/kits && export LD_LIBRARY_PATH=./ && ./classification -m ./mobilenet_v1_f16.bolt -a GPU" + ``` -- #### Reduce C project prepare time + - #### Reduce C project prepare time - - Argument *algoFileStream* of C API *ModelHandle CreateModelWithFileStream( const char *modelFileStream, AFFINITY_TYPE affinity, const char *algoFileStream)* is used to set your algofile filestream; - - Package kernel binary dynamic library into your project; + Package kernel binary dynamic library into your project, and put it in *libbolt.so* directory. 
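As an illustrative sketch (not part of this diff): with the prepared libxxx_map.so placed next to libbolt.so, a C project only passes the model path and affinity, and leaves the algorithm argument empty as described in the Note below. The model file name is a placeholder, and the calls are the ones declared in *inference/engine/api/c/bolt.h*.

```c
#include "bolt.h"

int main()
{
    /* libMali_G76p_map.so (or the library matching your GPU) is assumed to be
       shipped in the same directory as libbolt.so, so it can be found via dlopen. */
    ModelHandle model = CreateModel("./mobilenet_v1_f16.bolt", GPU, NULL);

    /* ... PrepareModel / AllocAllResultHandle / RunModel /
       GetOutputDataFromResultHandle as in the normal C API flow ... */

    DestroyModel(model);
    return 0;
}
```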
- ### Note - - Kernel binary dynamic library are binding with specific GPU type and your bolt models; - - Please run it under file path "/data/local/tmp" for android devices to ensure the program get full authorities; - - Argument *algoPath* of C API *ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algoPath)* is abandoned, for now algoInfo has been packaged into xxxlib.so, please set it *NULL*; + - OpenCL kernel functions are stored in the shared library libxxx_map.so in binary form. + Shared library libxxx_map.so is binding with specific GPU type and bolt models. + Bolt will use C system function *dlopen* to open shared library libxxx_map.so, please save it in same directory. + - Please run prepare program under */data/local/tmp* directory for android devices to ensure the program has write permission. + - Argument *algoPath* of C API *ModelHandle CreateModel(const char \*modelPath, AFFINITY_TYPE affinity, const char \*algoPath)* is abandoned in latest version, + algorithm file has been packaged into libxxx_map.so, please set it to *NULL*. \ No newline at end of file diff --git a/docs/USER_HANDBOOK.md b/docs/USER_HANDBOOK.md index 8ebb039d..630488c3 100644 --- a/docs/USER_HANDBOOK.md +++ b/docs/USER_HANDBOOK.md @@ -6,25 +6,20 @@ Before you try any step described in this document, please make sure you have in         [Model Conversion](#model-conversion)         [Model Inference](#model-inference)         [API](#api) -        [Performance Profiling](#performance-profiling) +        [Performance Profiling](#performance-profiling) +        [Model Visualization](#model-visualization) +        [Model Protection](#model-protection) +        [Environment variables](#environment-variables)     [Advanced Features](#advanced-features)         [INT8 Post Training Quantization](#int8-post-training-quantization)         [BNN Network Support](#bnn-network-support)         [Algorithm Tuning for Key Layers](#algorithm-tuning-for-key-layers)         [Time-Series Data Acceleration](#time-series-data-acceleration) +        [How to reduce gpu inference overhead](#how-to-reduce-gpu-inference-overhead) # Basic Usage --- -### Environment variables - -Some Linux shell environment variables are reserved for Bolt. - -- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON, You can set it *OFF* before model conversion to disable memory reuse optimization. Note that this setting takes effect during the model conversion. Once the model (.bolt) is stored, the memory reuse behavior is fixed. -- *BOLT_PADDING*: Bolt only supports RNN/GRU/LSTM hidden states number mod 32 = 0 case, If you want to run number mod 32 != 0 case, please set it to *ON* before model conversion. The default value is ON. -- *BOLT_INT8_STORAGE_ERROR_THRESHOLD*: Bolt supports storage precision and computation precision independent. You can use int8 model storage, FP32/FP16 computation. There will be a huge accuracy error when you quantize all float weight to int8 storage. So we provide a configure parameter to control only quantize < *BOLT_INT8_STORAGE_ERROR_THRESHOLD* weight. -- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by user to save tensor_computing library performance tuning result. - ### Model Conversion
@@ -40,10 +35,12 @@ Some Linux shell environment variables are reserved for Bolt. * [X2bolt](../model_tools/tools/X2bolt/X2bolt.cpp) is a general converter, which focuses on converting different deep learning model to bolt model. -Here we list the examples of two typical model conversions for Android backend, for X86 backend the ADB tool is not required. +*Here we list the examples of two typical model conversions for Android backend, for X86 backend the ADB tool is not required.* #### Caffe/ONNX/Tflite Model Conversion +Here we give an example of Caffe model conversion. ONNX and Tflite Model Conversions are similar to Caffe. The only difference is the suffix and number of model files. **If you want to convert ONNX model, you would better simplify ONNX model with [onnx-sim](https://github.com/daquexian/onnx-simplifier)**. + resnet50(caffe) model contains two model files : [resnet50.prototxt](https://github.com/KaimingHe/deep-residual-networks/blob/master/prototxt/ResNet-50-deploy.prototxt) and [resnet50.caffemodel](https://deepdetect.com/models/resnet/ResNet-50-model.caffemodel). Prepare these two model files on */home/resnet/* in advance. 1. Push your model to the phone; @@ -66,20 +63,22 @@ resnet50(caffe) model contains two model files : [resnet50.prototxt](https://git adb shell "./X2bolt --help" ``` -3. Execute ***X2bolt*** to convert a model from caffe model to bolt model. Here shows the example of float16 model conversion. +3. Execute ***X2bolt*** to convert a model from caffe model to bolt model. Here shows the example of float32 model conversion. ``` - adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP16" + adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP32" adb shell "ls /data/local/tmp/models/resnet50" - # command output$ resnet50_fp16.bolt + # command output$ resnet50_fp32.bolt ``` Note : Model conversion procedure of onnx and tflite is similar to caffe. #### Tensorflow Model Conversion -Save your mobilenet_v1 to frozen .pb model. And preprocess your model using [tf2json](../model_tools/tools/tensorflow2json/tf2json.py) which can convert the .pb to .json. Then use **X2bolt** to convert .json to .bolt model. +Save your mobilenet_v1 to frozened .pb model. +Preprocess .pb model using [tf2json](../model_tools/tools/tensorflow2json/tf2json.py) which can convert the .pb to .json. +Convert .json to .bolt model with **X2bolt**. Here is the example of mobilenet_v1_frozen.pb converted to mobilenet_v1.bolt. @@ -322,6 +321,36 @@ Bolt provides a program performance visualization interface to help user identif 4. Use Google Chrome browser to open extension. Load the JSON file. You can see the program execution time. ![](images/PerformanceProfiling.PNG) +### Model Visualization + +Bolt provides two ways to see model structure. + +- #### Using **-V** option in X2bolt or post_training_quantization to print model structure + +
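For example, the model structure can be printed during conversion; the command below is only an illustration (not from this diff) that assumes *-V* is simply appended to the X2bolt invocation shown earlier in this handbook:

```
adb shell "/data/local/tmp/bolt/tools/X2bolt -d /data/local/tmp/models/resnet50/ -m resnet50 -i FP32 -V"
```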
+ +- #### [Using netron to visualise bolt model](https://github.com/huawei-noah/bolt/issues/97) + +
+ + +### Model Protection + +If you don't want others to know your model structure, you can follow these steps to achieve goal. + +1. modify enum type *OperatorType*'s order and *OperatorTypeName* function in [common/uni/include/operator_type.h](common/uni/include/operator_type.h). +2. set cmake option *USE_MODEL_PRINT* to *OFF* in [common/cmakes/bolt.cmake](common/cmakes/bolt.cmake). + +### Environment variables + +Some Linux shell environment variables are reserved for Bolt. + +- *BOLT_MEMORY_REUSE_OPTIMIZATION*: whether to use memory reuse optimization. The default value is ON, You can set it *OFF* before model conversion to disable memory reuse optimization. Note that this setting takes effect during the model conversion. Once the model (.bolt) is stored, the memory reuse behavior is fixed. +- *BOLT_PADDING*: Bolt only supports RNN/GRU/LSTM hidden states number mod 32 = 0 case, If you want to run number mod 32 != 0 case, please set it to *ON* before model conversion. The default value is ON. +- *BOLT_INT8_STORAGE_ERROR_THRESHOLD*: Bolt supports storage precision and computation precision independent. You can use int8 model storage, FP32/FP16 computation. There will be a huge accuracy error when you quantize all float weight to int8 storage. So we provide a configure parameter to control only quantize < *BOLT_INT8_STORAGE_ERROR_THRESHOLD* weight. +- *Bolt_TensorComputing_LibraryAlgoritmMap*: a path on the target device set by user to save tensor_computing library performance tuning result. + + # Advanced Features --- @@ -368,3 +397,10 @@ Flow is the time-series data acceleration module for Bolt. Flow simplifies the a Flow provides flexible CPU multi-core parallelism and heterogeneous scheduling (CPU + GPU). User don't need to pay excessive attention to heterogeneous management and write lots of non-reusable code to implement a heterogeneous application. User can get the best end-to-end performance with the help of Flow. Flow supports data parallelism and subgraph parallelism, with a simple API. More usage information can be find in [DEVELOPER.md](./DEVELOPER.md#time-series-data-acceleration-by-using-flow). + +### How to reduce gpu inference overhead + +Bolt support ARM GPU inference with OpenCL, but there are a big overhead that is caused by compiling OpenCL kernel source code and selecting optimal algorithm. + +They can be optimized by preparing some files in advance. Inference can directly use prepared files. +You can refer [REDUCE_GPU_PREPARE_TIME.md](./REDUCE_GPU_PREPARE_TIME.md) for more details. 
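To make the reserved variables listed under *Environment variables* above concrete, here is an illustrative host-side conversion run (not part of the handbook; the values and paths only reuse the earlier resnet50 example):

```
# conversion-time settings (they take effect when the .bolt file is generated)
export BOLT_MEMORY_REUSE_OPTIMIZATION=OFF   # disable memory reuse optimization
export BOLT_PADDING=ON                      # keep RNN/GRU/LSTM padding support (default is ON)
./X2bolt -d ./models/resnet50/ -m resnet50 -i FP32
```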
diff --git a/docs/images/ChineseSpeechRecognition.PNG b/docs/images/ChineseSpeechRecognition.PNG deleted file mode 100644 index 66a6a8af..00000000 Binary files a/docs/images/ChineseSpeechRecognition.PNG and /dev/null differ diff --git a/docs/images/FaceDetection.PNG b/docs/images/FaceDetection.PNG deleted file mode 100644 index 85a93c96..00000000 Binary files a/docs/images/FaceDetection.PNG and /dev/null differ diff --git a/docs/images/Framework.PNG b/docs/images/Framework.PNG index a831acd8..109d9496 100644 Binary files a/docs/images/Framework.PNG and b/docs/images/Framework.PNG differ diff --git a/docs/images/ImageClassification.PNG b/docs/images/ImageClassification.PNG deleted file mode 100644 index 3603c9be..00000000 Binary files a/docs/images/ImageClassification.PNG and /dev/null differ diff --git a/docs/images/ReadingComprehension.gif b/docs/images/ReadingComprehension.gif new file mode 100644 index 00000000..bb695603 Binary files /dev/null and b/docs/images/ReadingComprehension.gif differ diff --git a/docs/images/Semantics.PNG b/docs/images/Semantics.PNG deleted file mode 100644 index fd299bad..00000000 Binary files a/docs/images/Semantics.PNG and /dev/null differ diff --git a/docs/images/X2bolt.PNG b/docs/images/X2bolt.PNG new file mode 100644 index 00000000..a57ca6aa Binary files /dev/null and b/docs/images/X2bolt.PNG differ diff --git a/docs/images/losses_of_training_lenet.PNG b/docs/images/losses_of_training_lenet.PNG new file mode 100644 index 00000000..68c34a70 Binary files /dev/null and b/docs/images/losses_of_training_lenet.PNG differ diff --git a/docs/images/losses_of_training_mobilenet.PNG b/docs/images/losses_of_training_mobilenet.PNG new file mode 100644 index 00000000..175bc606 Binary files /dev/null and b/docs/images/losses_of_training_mobilenet.PNG differ diff --git a/docs/images/losses_of_training_resnet.PNG b/docs/images/losses_of_training_resnet.PNG new file mode 100644 index 00000000..c8648f91 Binary files /dev/null and b/docs/images/losses_of_training_resnet.PNG differ diff --git a/docs/images/netron.PNG b/docs/images/netron.PNG new file mode 100644 index 00000000..e7545fa3 Binary files /dev/null and b/docs/images/netron.PNG differ diff --git a/inference/engine/api/c/bolt.h b/inference/engine/api/c/bolt.h index b729578a..bb51e422 100644 --- a/inference/engine/api/c/bolt.h +++ b/inference/engine/api/c/bolt.h @@ -33,9 +33,10 @@ typedef void *ResultHandle; /** CPU affinity policy */ typedef enum { - CPU_HIGH_PERFORMANCE = 0, ///< performance is high priority(use big core) - CPU_LOW_POWER = 1, ///< power is high priority(use small core) - GPU = 2 ///< use GPU + CPU = 0, ///< don't bind process to specific core + CPU_HIGH_PERFORMANCE = 1, ///< performance is high priority(use big core) + CPU_LOW_POWER = 2, ///< power is high priority(use small core) + GPU = 3 ///< use GPU } AFFINITY_TYPE; /** heterogeneous device type */ @@ -60,28 +61,20 @@ typedef enum { } DATA_TYPE; /** Get DATA_TYPE String */ -inline const char *const *GetDataTypeString() -{ - static const char *const names[] = {"FP_32", "FP_16", "INT_32", "UINT_32"}; - return names; -} +const char *const *GetDataTypeString(); /** multi-dimension data format */ typedef enum { NCHW = 0, ///< batch->channel->high->width data order NHWC = 1, ///< batch->high->width->channel data order - NCHWC8 = 2, ///< batch->channel/8->high->width->channel four element data order + NCHWC8 = 2, ///< batch->channel/8->high->width->channel eight element data order MTK = 3, ///< batch->time->unit data order NORMAL = 4, ///< batch->unit 
data order NCHWC4 = 5 ///< batch->channel/4->width->high->channel four element data order } DATA_FORMAT; /** Get DATA_FORMAT String */ -inline const char *const *GetDataFormatString() -{ - static const char *const names[] = {"NCHW", "NHWC", "NCHWC8", "MTK", "NORMAL"}; - return names; -} +const char *const *GetDataFormatString(); /** * @brief create model from file @@ -174,6 +167,12 @@ void PrepareModel(ModelHandle ih, * @param ih inference pipeline handle * * @return result data memory handle + * @note destroy result when unused + * @code + * ResultHandle result = AllocAllResultHandle(...); + * ... + * FreeResultHandle(result); + * @endcode */ ResultHandle AllocAllResultHandle(ModelHandle ih); @@ -213,6 +212,11 @@ int GetNumOutputsFromResultHandle(ResultHandle ir); * @note * name/n/c/h/w/dt/df array space must be allocated before calling, the array length must be equal to num_inputs. * each element of name must be allocated, the array length must be equal to 128. + * GetOutputDataInfoFromResultHandle must behind RunModel because RunModel will change ResultHandle. + * @code + * RunModel(...); + * GetOutputDataInfoFromResultHandle(...); + * @endcode */ void GetOutputDataInfoFromResultHandle(ResultHandle ir, int num_outputs, @@ -231,6 +235,11 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, * @param data the array of all output data's content * * @return + * @note GetOutputDataFromResultHandle must behind RunModel because RunModel will change ResultHandle. + * @code + * RunModel(...); + * GetOutputDataFromResultHandle(...); + * @endcode */ void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data); @@ -337,6 +346,11 @@ void ResizeModelInput(ModelHandle ih, * @param name the array of tesor name that needed * * @return result data memory handle + * @code + * ResultHandle result = AllocSpecificResultHandle(...); + * ... + * FreeResultHandle(result); + * @endcode */ ResultHandle AllocSpecificResultHandle(ModelHandle ih, int num_outputs, const char **name); @@ -399,6 +413,15 @@ void SetRuntimeDeviceDynamic(ModelHandle ih); * @return */ void SetNumThreads(int threads); + +/** + * @brief check memory leak + * + * @note + * This can only be used at the end of program after Model and Result free. 
+ * @return + */ +void MemoryCheck(); #ifdef __cplusplus } #endif diff --git a/inference/engine/api/java/com/huawei/noah/AffinityType.java b/inference/engine/api/java/com/huawei/noah/AffinityType.java index 4cfe0704..5c458851 100644 --- a/inference/engine/api/java/com/huawei/noah/AffinityType.java +++ b/inference/engine/api/java/com/huawei/noah/AffinityType.java @@ -22,6 +22,7 @@ /** affinity policy */ public enum AffinityType { + CPU, ///< don't bind process to specific core CPU_HIGH_PERFORMANCE, ///< performance is high priority(use CPU big core) CPU_LOW_POWER, ///< power is high priority(use CPU small core) GPU ///< use ARM MALI GPU diff --git a/inference/engine/include/activation.hpp b/inference/engine/include/activation.hpp index 293f7119..bdfeada6 100644 --- a/inference/engine/include/activation.hpp +++ b/inference/engine/include/activation.hpp @@ -18,9 +18,9 @@ class Activation : public Operator { public: - Activation(ActivationParamSpec activationDesc) + Activation(ActivationParamSpec p) { - this->activationDesc = activationDesc; + this->p = p; std::map activationMap = {{ACTIVATION_RELU, OT_Relu}, {ACTIVATION_RELU6, OT_Relu6}, {ACTIVATION_H_SWISH, OT_HSwish}, {ACTIVATION_H_SWISH_NODIV, OT_HSwishNoDiv}, {ACTIVATION_SIGMOID, OT_Sigmoid}, @@ -28,13 +28,14 @@ class Activation : public Operator { {ACTIVATION_TANH, OT_TanH}, {ACTIVATION_MISH, OT_Mish}, {ACTIVATION_GREATER, OT_Greater}, {ACTIVATION_EXP, OT_Exp}, {ACTIVATION_SOFTPLUS, OT_SoftPlus}, {ACTIVATION_ABS, OT_Abs}, {ACTIVATION_SIGN, OT_Sign}, {ACTIVATION_NOT, OT_Not}, {ACTIVATION_LOG, OT_Log}, - {ACTIVATION_NEG, OT_Neg}}; - if (activationMap.find(activationDesc.mode) == activationMap.end()) { - UNI_ERROR_LOG("can not map ActivationMode to OperatorType.\n"); + {ACTIVATION_NEG, OT_Neg}, {ACTIVATION_ROUND, OT_Round}, {ACTIVATION_FLOOR, OT_Floor}, + {ACTIVATION_CEIL, OT_Ceil}, {ACTIVATION_SWISH, OT_Swish}, + {ACTIVATION_RECIPROCAL, OT_Reciprocal}}; + if (activationMap.find(p.mode) == activationMap.end()) { + UNI_ERROR_LOG("can not map ActivationMode(%d) to OperatorType.\n", p.mode); } else { - this->opt = activationMap[activationDesc.mode]; + this->opt = activationMap[p.mode]; } - this->lenOfTemp = 0; } OperatorType get_type() override @@ -48,7 +49,7 @@ class Activation : public Operator { } protected: - ActivationParamSpec activationDesc; + ActivationParamSpec p; OperatorType opt; }; diff --git a/inference/engine/include/attention.hpp b/inference/engine/include/attention.hpp index 87892b81..a992be7f 100644 --- a/inference/engine/include/attention.hpp +++ b/inference/engine/include/attention.hpp @@ -15,7 +15,6 @@ #define _ATTENTION_H #include "operator.hpp" -#include "tensor_computing.h" class Attention : public Operator { public: diff --git a/inference/engine/include/batch_norm.hpp b/inference/engine/include/batch_norm.hpp index 4dba830b..6f762f97 100644 --- a/inference/engine/include/batch_norm.hpp +++ b/inference/engine/include/batch_norm.hpp @@ -22,7 +22,6 @@ class BatchNorm : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = 0; } OperatorType get_type() override @@ -32,7 +31,6 @@ class BatchNorm : public WeightOperator { protected: BatchNormParamSpec p; - U32 numChannels; }; #endif // _BATCH_NORM_H diff --git a/inference/engine/include/cast.hpp b/inference/engine/include/cast.hpp index 318422e7..78d84b72 100644 --- a/inference/engine/include/cast.hpp +++ b/inference/engine/include/cast.hpp @@ -29,11 +29,6 @@ class Cast : public Operator { return OT_Cast; } - bool can_input_output_the_same() override - { 
- return false; - } - public: CastParamSpec p; }; diff --git a/inference/engine/include/check.hpp b/inference/engine/include/check.hpp index 2c8ce0b0..f07dbbc2 100644 --- a/inference/engine/include/check.hpp +++ b/inference/engine/include/check.hpp @@ -14,9 +14,9 @@ #ifndef _CHECK_H #define _CHECK_H -#include "operator.hpp" +#include "weight_operator.hpp" -class Check : public Operator { +class Check : public WeightOperator { public: Check(DataType dt, CheckParamSpec p) { diff --git a/inference/engine/include/cnn.h b/inference/engine/include/cnn.h index c9b03f4b..b320c344 100644 --- a/inference/engine/include/cnn.h +++ b/inference/engine/include/cnn.h @@ -82,6 +82,8 @@ class CNN : public Model { void update_op_tensors(); + void update_tensor_positions(); + void set_input_desc(std::map inputDescMap); void infer_tmp_memory_size() override; @@ -98,6 +100,8 @@ class CNN : public Model { void clean_tensorMap_desc(); + void check_dynamic_output_size(OperatorType type); + private: std::map> tensorMap; std::map> operatorMap; @@ -116,5 +120,6 @@ class CNN : public Model { #ifdef _USE_GPU ImageContainer tmpImages; #endif + bool dynamicOutputSize = false; }; #endif diff --git a/inference/engine/include/constant.hpp b/inference/engine/include/constant.hpp index 93c3e344..d58cd1bd 100644 --- a/inference/engine/include/constant.hpp +++ b/inference/engine/include/constant.hpp @@ -40,7 +40,7 @@ class Constant : public Operator { { Tensor outputTensor = this->outputTensors[0]; auto outputPtr = ((CpuMemory *)outputTensor.get_memory())->get_ptr(); - memcpy(outputPtr, data, tensorNumBytes(constDesc)); + UNI_MEMCPY(outputPtr, data, tensorNumBytes(constDesc)); } EE infer_output_tensors_size(std::vector *outDims) override diff --git a/inference/engine/include/constant_of_shape.hpp b/inference/engine/include/constant_of_shape.hpp new file mode 100644 index 00000000..5ed4eb21 --- /dev/null +++ b/inference/engine/include/constant_of_shape.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _CONSTANT_OF_SHAPE_H +#define _CONSTANT_OF_SHAPE_H + +#include "operator.hpp" + +class ConstantOfShape : public Operator { +public: + explicit ConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_ConstantOfShape; + } + +protected: + ConstantOfShapeParamSpec p; +}; +#endif // _CONSTANT_OF_SHAPE_H diff --git a/inference/engine/include/cpu/activation_cpu.hpp b/inference/engine/include/cpu/activation_cpu.hpp index c598c142..7294b264 100644 --- a/inference/engine/include/cpu/activation_cpu.hpp +++ b/inference/engine/include/cpu/activation_cpu.hpp @@ -18,13 +18,13 @@ class ActivationCPU : public Activation { public: - ActivationCPU(ActivationParamSpec activationDesc) : Activation(activationDesc) + ActivationCPU(ActivationParamSpec p) : Activation(p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ActivationCPU(this->activationDesc)); + std::shared_ptr(new ActivationCPU(this->p)); *mem = *this; return mem; } @@ -33,15 +33,14 @@ class ActivationCPU : public Activation { { Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo)); + CHECK_STATUS(activation(inputTensor, this->p, outputTensor, &this->archInfo)); outputTensor.set_scale(inputTensor.get_scale()); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); - return SUCCESS; + return activation_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/batch_norm_cpu.hpp b/inference/engine/include/cpu/batch_norm_cpu.hpp index 651f962d..24f11d20 100644 --- a/inference/engine/include/cpu/batch_norm_cpu.hpp +++ b/inference/engine/include/cpu/batch_norm_cpu.hpp @@ -38,36 +38,29 @@ class BatchNormCPU : public BatchNorm { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - auto inputDesc = inTensors[0]->get_desc(); - this->set_channels_from_weight(); - TensorDesc outputDesc = inputDesc; - //if (outputDesc.nDims == 3 && this->p.axis == -1 && outputDesc.dims[0] == this->numChannels) { - // outputDesc.df = DF_NHWC; - //} - outTensors[0]->resize(outputDesc); + outTensors[0]->resize(inTensors[0]->get_desc()); return SUCCESS; } - void set_channels_from_weight() + int get_channels_num() { + int ret = 0; auto curOpWs = this->get_weightspec(); if (0 != curOpWs.bytes_of_weight) { - this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + ret = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { - this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else { - this->numChannels = 0; + ret = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } + return ret; } EE infer_weight_desc() override { - // weight is mean, bias is variance - this->set_channels_from_weight(); + int num = this->get_channels_num(); this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->weightTensors[0].resize(tensor1d(this->dt, num)); this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->biasTensors[0].resize(tensor1d(this->dt, num)); return SUCCESS; } diff --git 
a/inference/engine/include/cpu/cast_cpu.hpp b/inference/engine/include/cpu/cast_cpu.hpp index 4a7c74bd..544795b3 100644 --- a/inference/engine/include/cpu/cast_cpu.hpp +++ b/inference/engine/include/cpu/cast_cpu.hpp @@ -30,13 +30,13 @@ class CastCPU : public Cast { void run() override { - CHECK_STATUS(cast(this->inputTensors[0], this->outputTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(cast_infer_output_size(inTensors[0], outTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/channel_resize_cpu.hpp b/inference/engine/include/cpu/channel_resize_cpu.hpp index cf5d6c5f..8fa0789c 100644 --- a/inference/engine/include/cpu/channel_resize_cpu.hpp +++ b/inference/engine/include/cpu/channel_resize_cpu.hpp @@ -43,7 +43,7 @@ class ChannelResizeCPU : public ChannelResize { if (!this->valid) { if (inputPtr != outputPtr) { CHECK_REQUIREMENT(inputSize == outputSize); - memcpy(outputPtr, inputPtr, inputSize); + UNI_MEMCPY(outputPtr, inputPtr, inputSize); } } else if (this->rearrange && DF_NCHWC8 == inputDesc.df && DF_NCHWC8 == outputDesc.df) { transformNCHWC8ToNCHWC8ByGroup( diff --git a/inference/engine/include/cpu/check_cpu.hpp b/inference/engine/include/cpu/check_cpu.hpp index 464721f9..92599660 100644 --- a/inference/engine/include/cpu/check_cpu.hpp +++ b/inference/engine/include/cpu/check_cpu.hpp @@ -31,7 +31,12 @@ class CheckCPU : public Check { void run() override { Tensor inputATensor = this->inputTensors[0]; - Tensor inputBTensor = this->inputTensors[1]; + Tensor inputBTensor; + if (this->weightTensors.size() > 0) { + inputBTensor = this->weightTensors[0]; + } else { + inputBTensor = this->inputTensors[1]; + } Tensor outputTensor = this->outputTensors[0]; CHECK_STATUS(check(inputATensor, inputBTensor, this->p, outputTensor, &this->archInfo)); } @@ -41,6 +46,17 @@ class CheckCPU : public Check { { return check_infer_output_size(inTensors, outTensors[0], &this->archInfo); } + + EE infer_weight_desc() override + { + auto curOpWs = this->get_weightspec(); + if (curOpWs.bytes_of_weight > 0) { + this->weightTensors = std::vector(1); + this->weightTensors[0].resize( + tensor2d(curOpWs.mdt, 1, curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt))); + } + return SUCCESS; + } }; #endif // _CHECK_CPU_H diff --git a/inference/engine/include/cpu/concat_cpu.hpp b/inference/engine/include/cpu/concat_cpu.hpp index b758c63d..3cddb585 100644 --- a/inference/engine/include/cpu/concat_cpu.hpp +++ b/inference/engine/include/cpu/concat_cpu.hpp @@ -44,7 +44,8 @@ class ConcatCPU : public Concat { U32 infer_tmp_memory_size() override { U32 bytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes( + this->inputTensors, this->outputTensors[0], &bytes, &this->archInfo)); return bytes; } }; diff --git a/inference/engine/include/cpu/constant_of_shape_cpu.hpp b/inference/engine/include/cpu/constant_of_shape_cpu.hpp new file mode 100644 index 00000000..6e2f2f73 --- /dev/null +++ b/inference/engine/include/cpu/constant_of_shape_cpu.hpp @@ -0,0 +1,55 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CONSTATNT_OF_SHAPE_CPU_H +#define _CONSTATNT_OF_SHAPE_CPU_H + +#include "constant_of_shape.hpp" + +class ConstantOfShapeCPU : public ConstantOfShape { +public: + ConstantOfShapeCPU(DataType dt, ConstantOfShapeParamSpec p) : ConstantOfShape(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new ConstantOfShapeCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + TensorDesc desc = this->outputTensors[0].get_desc(); + UNI_INIT(tensorNumElements(desc), desc.dt, this->p.value, + ((CpuMemory *)(this->outputTensors[0].get_memory()))->get_ptr()); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDesc = inTensors[0]->get_desc(); + TensorDesc outDesc; + outDesc.dt = this->p.dt; + outDesc.nDims = inDesc.dims[0]; + outDesc.df = getTensorDefaultDataFormat(outDesc.nDims); + for (U32 i = 0; i < outDesc.nDims; i++) { + outDesc.dims[i] = inDesc.dims[inDesc.nDims + inDesc.dims[0] - 1 - i]; + } + outTensors[0]->resize(outDesc); + return SUCCESS; + } +}; + +#endif // CONSTATNT_OF_SHAPE_CPU_H diff --git a/inference/engine/include/cpu/convolution_cpu.hpp b/inference/engine/include/cpu/convolution_cpu.hpp index cc5003d1..176a1735 100644 --- a/inference/engine/include/cpu/convolution_cpu.hpp +++ b/inference/engine/include/cpu/convolution_cpu.hpp @@ -33,17 +33,6 @@ class ConvolutionCPU : public Convolution { return mem; } - DataType get_float_precision() - { - DataType ret = this->dt; - if (this->dt == DT_F16_8Q) { - ret = DT_F16; - } else if (this->dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } - EE init_weight_bias_from_model(std::shared_ptr *modelPtrShared) override { U8 *modelPtr = nullptr; @@ -55,12 +44,21 @@ class ConvolutionCPU : public Convolution { if (modelPtr != nullptr) { filterDt = this->dt; } - DataType dtNoQ = this->get_float_precision(); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); U32 isBNN = 0; if (filterDt == DT_BIN01 || filterDt == DT_BIN11) { isBNN = 1; } + if (curOpWs.num_quant_scale == this->weightTensors.size()) { + for (U32 i = 0; i < this->weightTensors.size(); ++i) { + if (curOpWs.weight_scale[i].num_scale > 0) { + this->weightTensors[i].set_scale_ptr( + std::shared_ptr(curOpWs.weight_scale[i].scale, [](F32 *) {})); + } + } + } + for (U32 i = 0; i < this->weightTensors.size(); i++) { TensorDesc desc = this->weightTensors[i].get_desc(); desc.dt = filterDt; @@ -69,7 +67,7 @@ class ConvolutionCPU : public Convolution { for (U32 i = 0; i < this->biasTensors.size(); i++) { TensorDesc desc = this->biasTensors[i].get_desc(); desc.dt = dtNoQ; - if (this->p.convolution_type == Convolution_Pointwise) { + if (this->p.convolution_type == CONVOLUTION_POINTWISE) { U32 vectorLen = this->p.num_outputs; // bias length if (isBNN == 1) { this->dt = dtNoQ; // BNN convolution should not be quantized further @@ -88,12 +86,12 @@ class ConvolutionCPU : public Convolution { U32 offset_bytes = 0; if (modelPtr != nullptr) { this->weightTensors[j].alloc(); - memcpy(((CpuMemory *)(this->weightTensors[j].get_memory()))->get_ptr(), modelPtr, - weight_bytes); + UNI_MEMCPY(((CpuMemory *)(this->weightTensors[j].get_memory()))->get_ptr(), + modelPtr, weight_bytes); offset_bytes += weight_bytes; if (this->hasBias) { this->biasTensors[j].alloc(); - memcpy(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), + UNI_MEMCPY(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), modelPtr + offset_bytes, bias_bytes); offset_bytes += bias_bytes; } @@ -106,7 +104,7 @@ class ConvolutionCPU : public Convolution { weight_offset += weight_bytes; if (this->hasBias) { this->biasTensors[j].alloc(); - memcpy(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), + UNI_MEMCPY(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), curOpWs.vec + bias_offset, bias_bytes); bias_offset += bias_bytes; } @@ -118,10 +116,10 @@ class ConvolutionCPU : public Convolution { U8 *ptr = (U8 *)((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(); UNI_INIT(p.num_outputs, DT_F16, 1.0, ptr); ptr += bias_bytes / 2; - memset(ptr, 0, bias_bytes / 2); // second half is bias + UNI_MEMSET(ptr, 0, bias_bytes / 2); // second half is bias #endif } else { - memset(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), 0, + UNI_MEMSET(((CpuMemory *)(this->biasTensors[j].get_memory()))->get_ptr(), 0, bias_bytes); } } @@ -143,62 +141,54 @@ class ConvolutionCPU : public Convolution { outputTensor.resize(outputDesc); F32 *scalePtr = nullptr; - switch (this->p.convolution_type) { - case Convolution_Pointwise: { #if defined(_USE_INT8) - if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { - TensorDesc inputDesc = inputTensor.get_desc(); - scalePtr = this->scales.get(); - scalePtr[0] = inputTensor.get_scale(); - if (featureScale.size() > 0 && featureScale[0][0] > 0) { - scalePtr[0] = featureScale[0][0]; - } else if (DT_F16 == inputDesc.dt) { - scalePtr[0] = -1; - } - if (featureScale.size() > 0 && (featureScale.back())[0] != -2) { - scalePtr[1] = (featureScale.back())[0]; - } else { - scalePtr[1] = -1; - } - } + if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { + TensorDesc inputDesc = inputTensor.get_desc(); + scalePtr = this->scales.get(); + scalePtr[0] = inputTensor.get_scale(); + if (DT_I8 != inputDesc.dt && DT_U8_Q != inputDesc.dt && featureScale.size() > 0 && + featureScale[0][0] > 0) { + scalePtr[0] = featureScale[0][0]; + } + if (featureScale.size() > 0 && 
(featureScale.back())[0] != -2) { + scalePtr[1] = (featureScale.back())[0]; + } else { + scalePtr[1] = -1; + } + } #endif + switch (this->p.convolution_type) { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { std::vector tmpTensors(1, this->temp); CHECK_STATUS(convolution(this->inputTensors, filterTensor, p, this->pwAlg, scalePtr, biasTensor, tmpTensors, outputTensor, this->pwActivationParamSpec, &this->archInfo)); -#if defined(_USE_INT8) - auto outputDesc = outputTensor.get_desc(); - if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { - outputTensor.set_scale(scalePtr[1]); - } -#endif break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution(this->inputTensors[0], filterTensor, p, - this->dwAlg, biasTensor, this->temp, outputTensor, this->dwActivationParamSpec, - &this->archInfo)); + this->dwAlg, scalePtr, biasTensor, this->temp, outputTensor, + this->dwActivationParamSpec, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { std::vector tmpTensors(1, this->temp); CHECK_STATUS(depthwise_pointwise_convolution(this->inputTensors, filterTensor, - weightTensors[1], p, this->dwAlg, biasTensor, biasTensors[1], tmpTensors, - outputTensor, this->dwActivationParamSpec, this->pwActivationParamSpec, - &this->archInfo)); - break; - } - case Convolution_Dilation: { - std::vector tmpTensors(1, this->temp); - CHECK_STATUS(convolution(this->inputTensors, filterTensor, p, this->pwAlg, scalePtr, - biasTensor, tmpTensors, outputTensor, this->pwActivationParamSpec, - &this->archInfo)); + weightTensors[1], p, this->dwAlg, scalePtr, biasTensor, biasTensors[1], + tmpTensors, outputTensor, this->dwActivationParamSpec, + this->pwActivationParamSpec, &this->archInfo)); break; } default: { UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type); } } +#if defined(_USE_INT8) + if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { + outputTensor.set_scale(scalePtr[1]); + } +#endif inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); } @@ -220,7 +210,8 @@ class ConvolutionCPU : public Convolution { DataType targetType = filterDesc.dt; I32 algo; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { if (this->dt == DT_F16_8Q || this->dt == DT_F32_8Q) { #ifndef _USE_X86 targetType = DT_I8; @@ -244,7 +235,7 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; } else { @@ -256,7 +247,7 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { this->dwAlg = (DepthwiseConvolutionForwardAlgorithm)algo; } else { @@ -269,20 +260,9 @@ class ConvolutionCPU : public Convolution { } break; } - case Convolution_Dilation: { - if (algorithmMap->getAlgorithmInfoFromMap(this->name, &algo, 1)) { - this->pwAlg = (ConvolutionForwardAlgorithm)algo; - } else { - CHECK_STATUS(convolution_infer_forward_algorithm(inputTensor, filterTensor, - outputTensor, p, policy, &(this->pwAlg), targetType, - this->pwActivationParamSpec, &this->archInfo)); - algo = this->pwAlg; - algorithmMap->setAlgorithmInfoToMap(this->name, &algo, 1); - } - break; - } default: - 
CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's algorithm.\n"); + return NOT_SUPPORTED; } inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); @@ -317,7 +297,9 @@ class ConvolutionCPU : public Convolution { } } DataType targetType = this->dt; - if (Convolution_Pointwise == this->p.convolution_type) { + int numChannels = ic; + if (this->p.convolution_type == CONVOLUTION_DILATION || + this->p.convolution_type == CONVOLUTION_POINTWISE) { if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { #ifndef _USE_X86 targetType = DT_I8; @@ -325,10 +307,6 @@ class ConvolutionCPU : public Convolution { targetType = DT_U8_Q; #endif } - } - int numChannels = ic; - if (this->p.convolution_type == Convolution_Dilation || - this->p.convolution_type == Convolution_Pointwise) { numChannels /= this->p.group; } @@ -338,14 +316,14 @@ class ConvolutionCPU : public Convolution { channelAxis = 4; filterDesc.push_back(tensor5d(this->dt, this->p.num_outputs, numChannels, this->p.kernel_t, this->p.kernel_h, this->p.kernel_w)); - if (Convolution_Depthwise_Pointwise == this->p.convolution_type) { + if (CONVOLUTION_DEPTHWISE_POINTWISE == this->p.convolution_type) { filterDesc.push_back(tensor5d(this->dt, this->p.num_outputs, numChannels, 1, 1, 1)); } } else if (tensorIs4d(inDim)) { channelAxis = 3; filterDesc.push_back(tensor4d( this->dt, this->p.num_outputs, numChannels, this->p.kernel_h, this->p.kernel_w)); - if (Convolution_Depthwise_Pointwise == this->p.convolution_type) { + if (CONVOLUTION_DEPTHWISE_POINTWISE == this->p.convolution_type) { filterDesc.push_back(tensor4d(this->dt, this->p.num_outputs, numChannels, 1, 1)); } } @@ -354,13 +332,14 @@ class ConvolutionCPU : public Convolution { filterTensor[i].resize(filterDesc[i]); } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); CHECK_STATUS(convolution_infer_output_size( inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { filterDesc[0].dims[channelAxis] = 1; filterTensor[0].resize(filterDesc[0]); biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); @@ -368,7 +347,7 @@ class ConvolutionCPU : public Convolution { inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { filterDesc[0].dims[channelAxis] = 1; filterTensor[0].resize(filterDesc[0]); biasDesc.push_back(tensor1d(this->dt, numChannels)); @@ -377,14 +356,9 @@ class ConvolutionCPU : public Convolution { filterTensor[0], filterTensor[1], p, outputTensor, targetType, &this->archInfo)); break; } - case Convolution_Dilation: { - biasDesc.push_back(tensor1d(this->dt, this->p.num_outputs)); - CHECK_STATUS(convolution_infer_output_size( - inputTensor, filterTensor[0], p, outputTensor, targetType, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's output.\n"); + return NOT_SUPPORTED; } TensorDesc outputDesc = outputTensor->get_desc(); if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { @@ -435,29 +409,26 @@ class ConvolutionCPU : public Convolution { U32 bytes = 0; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { 
CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, - outputTensor, p, this->pwAlg, &bytes, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tmp memory.\n"); + break; } inputTensor.resize(oriInputDesc); outputTensor.resize(oriOutputDesc); @@ -469,28 +440,26 @@ class ConvolutionCPU : public Convolution { auto filterTensor = this->weightTensors[0]; U32 bytes = 0; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter_bytes( filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter_bytes( filterTensor, this->p, this->dwAlg, &bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, weightTensors[1], this->p, this->dwAlg, &bytes, bytesExtra, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter_bytes( - filterTensor, this->p, this->pwAlg, &bytes, &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tramsform filter tmp " + "memory.\n"); + break; } return bytes; } @@ -501,9 +470,10 @@ class ConvolutionCPU : public Convolution { this->wtm = std::shared_ptr(new Tensor()); TensorDesc wtmDesc; + // int8 winograd if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && - Convolution_Pointwise == this->p.convolution_type && - CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd + CONVOLUTION_POINTWISE == this->p.convolution_type && + CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { #if defined(_USE_INT8) U32 ftBytes; CHECK_STATUS(convolution_transform_filter_bytes( @@ -525,24 +495,32 @@ class ConvolutionCPU : public Convolution { this->scales = std::shared_ptr((F32 *)operator new(38 * bytesOf(DT_F32))); CHECK_STATUS( quantize(tFilter, this->wtm.get(), this->scales.get() + 2, &(this->archInfo))); + // int8 tilegemm } else if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && - Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + (CONVOLUTION_POINTWISE == this->p.convolution_type || + CONVOLUTION_DILATION == this->p.convolution_type)) { TensorDesc qDesc = filterTensor.get_desc(); - qDesc.dt = DT_I8; - Tensor qFilterTensor = Tensor::alloc_sized(qDesc); this->scales = std::shared_ptr((F32 *)operator new(3 * bytesOf(DT_F32))); - this->scales.get()[2] = -1; - CHECK_STATUS( - quantize(filterTensor, &qFilterTensor, this->scales.get() + 2, &(this->archInfo))); + if (qDesc.dt != DT_I8) { + qDesc.dt = DT_I8; + Tensor 
qFilterTensor = Tensor::alloc_sized(qDesc); + this->scales.get()[2] = -1; + CHECK_STATUS(quantize( + filterTensor, &qFilterTensor, this->scales.get() + 2, &(this->archInfo))); + filterTensor = qFilterTensor; + filterTensor.set_scale(this->scales.get()[2]); + } else { + this->scales.get()[2] = filterTensor.get_scale(); + } U32 ftmBytes; CHECK_STATUS(convolution_transform_filter_bytes( - qFilterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo)); + filterTensor, this->p, this->pwAlg, &ftmBytes, &this->archInfo)); *(this->wtm.get()) = Tensor::alloc_sized(tensor1d(DT_U8, ftmBytes)); // trans filter CHECK_STATUS(convolution_transform_filter( - qFilterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); + filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); #endif } else { // All other cases U32 bytesExtra; @@ -551,17 +529,18 @@ class ConvolutionCPU : public Convolution { wtm->alloc(); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_DILATION: + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter( filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { Tensor pwTensor; pwTensor.resize(tensor1d(DT_U8, bytesExtra)); pwTensor.alloc(); @@ -571,13 +550,9 @@ class ConvolutionCPU : public Convolution { weightTensors[1] = pwTensor; break; } - case Convolution_Dilation: { - CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, - this->temp, this->wtm.get(), &this->archInfo)); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to transform new type convolution's filter.\n"); + return NOT_SUPPORTED; } } this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/cpu/copy_cpu.hpp b/inference/engine/include/cpu/copy_cpu.hpp index 83f482c3..f1828570 100644 --- a/inference/engine/include/cpu/copy_cpu.hpp +++ b/inference/engine/include/cpu/copy_cpu.hpp @@ -43,11 +43,11 @@ class CopyCPU : public Copy { U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; U32 srcBatchStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[0] : tensorNumElements(srcDesc) / batch; - U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] + U32 srcStride = (this->p.src_dims[1] >= 0) ? this->p.src_dims[1] : tensorNumElements(srcDesc) / batch; U32 dstBatchStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[0] : tensorNumElements(dstDesc) / batch; - U32 dstStride = (this->p.dst_dims[0] >= 0) ? this->p.dst_dims[1] + U32 dstStride = (this->p.dst_dims[1] >= 0) ? this->p.dst_dims[1] : tensorNumElements(dstDesc) / batch; for (U32 i = 0; i < batch; i++) { U32 srcBlockIndex = 0; diff --git a/inference/engine/include/cpu/cumsum_cpu.hpp b/inference/engine/include/cpu/cumsum_cpu.hpp new file mode 100644 index 00000000..0508a98a --- /dev/null +++ b/inference/engine/include/cpu/cumsum_cpu.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. 
+ +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CUMSUM_CPU_H +#define _CUMSUM_CPU_H + +#include "cumsum.hpp" + +class CumSumCPU : public CumSum { +public: + CumSumCPU(DataType dt, CumSumParamSpec p) : CumSum(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new CumSumCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(cumsum(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(cumsum_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // CUMSUM_CPU_H diff --git a/inference/engine/include/cpu/deconvolution_cpu.hpp b/inference/engine/include/cpu/deconvolution_cpu.hpp index caadfe7a..6d7d467a 100644 --- a/inference/engine/include/cpu/deconvolution_cpu.hpp +++ b/inference/engine/include/cpu/deconvolution_cpu.hpp @@ -33,20 +33,18 @@ class DeconvolutionCPU : public Deconvolution { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - DataType filterDt = curOpWs.mdt; // weight data type may not be the same as input and output + DataType fdt = curOpWs.mdt; if (curOpWs.weight == nullptr) { - filterDt = this->dt; + fdt = this->dt; } - DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; - CHECK_REQUIREMENT(filterDt != DT_BIN01 && filterDt != DT_BIN11); - DataFormat filterDf = DF_NCHW; - TensorDesc filterTensorDesc = tensor4df(filterDt, filterDf, this->numInputs, - this->p.num_outputs, this->p.kernel_h, this->p.kernel_w); - // bias length - U32 vectorLen = this->numInputs * this->p.group; + if (fdt == DT_BIN01 || fdt == DT_BIN11) { + return NOT_MATCH; + } + TensorDesc filterTensorDesc = tensor4df( + fdt, DF_NCHW, this->numInputs, this->p.num_outputs, this->p.kernel_h, this->p.kernel_w); // bias data type should be the same as input and output - TensorDesc vectorTensorDesc = tensor1d(dtNoQ, vectorLen); - + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); + TensorDesc vectorTensorDesc = tensor1d(dtNoQ, this->numInputs * this->p.group); this->weightTensors = std::vector(1); this->weightTensors[0].resize(filterTensorDesc); this->biasTensors = std::vector(1); @@ -62,13 +60,8 @@ class DeconvolutionCPU : public Deconvolution { Tensor outputTensor = this->outputTensors[0]; TensorDesc oriOutputDesc = outputTensor.get_desc(); outputTensor.resize(transformDescTo4d(oriOutputDesc)); - Tensor filterTensor = this->weightTensors[0]; Tensor biasTensor = this->biasTensors[0]; - auto filterDesc = filterTensor.get_desc(); - if (filterDesc.dt == DT_BIN01 || filterDesc.dt == DT_BIN11) { - CHECK_STATUS(NOT_SUPPORTED); - } CHECK_STATUS(deconvolution(inputTensor, filterTensor, p, this->alg, nullptr, biasTensor, this->temp, outputTensor, this->activationDesc, &this->archInfo)); inputTensor.resize(oriInputDesc); @@ -120,7 +113,7 @@ class DeconvolutionCPU : public Deconvolution { filterTensor.resize(filterDim); DataType targetType = this->dt; - if (DT_F16_8Q == this->dt) { + if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { targetType = DT_I8; } @@ -167,10 +160,10 @@ class DeconvolutionCPU : public Deconvolution { Tensor filterTensor = this->weightTensors[0]; auto wtmBytes = this->infer_wtm_memory_size(); Tensor wtm = Tensor::alloc_sized(tensor1d(DT_U8, wtmBytes)); - CHECK_STATUS(deconvolution_transform_filter( - filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo)); + EE ret = deconvolution_transform_filter( + filterTensor, this->p, this->alg, this->temp, &wtm, &this->archInfo); this->weightTensors[0] = wtm; - return SUCCESS; + return ret; } }; diff --git a/inference/engine/include/cpu/depth2space_cpu.hpp b/inference/engine/include/cpu/depth2space_cpu.hpp new file mode 100644 index 00000000..ba673f84 --- /dev/null +++ b/inference/engine/include/cpu/depth2space_cpu.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
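The new *_cpu.hpp headers introduced by this patch (constant_of_shape_cpu.hpp, cumsum_cpu.hpp, depth2space_cpu.hpp above, and the others that follow) share one Operator-subclass skeleton: a constructor taking the data type and param spec, clone(), run(), and infer_output_tensors_size(). The sketch below only illustrates that pattern; ExampleCPU, Example, ExampleParamSpec and the example()/example_infer_output_size() kernels are hypothetical placeholders, not code from the patch.

class ExampleCPU : public Example {
public:
    ExampleCPU(DataType dt, ExampleParamSpec p) : Example(dt, p)
    {}

    std::shared_ptr<Operator> clone() override
    {
        // copy-construct a new operator with the same data type and parameters
        std::shared_ptr<ExampleCPU> mem =
            std::shared_ptr<ExampleCPU>(new ExampleCPU(this->dt, this->p));
        *mem = *this;
        return mem;
    }

    void run() override
    {
        // forward the bound input/output tensors to the compute kernel
        CHECK_STATUS(example(inputTensors[0], this->p, outputTensors[0], &this->archInfo));
    }

    EE infer_output_tensors_size(
        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
    {
        // shape inference only: resize the output descriptor, no allocation here
        return example_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo);
    }
};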
+ +#ifndef _DEPTH2SPACE_CPU_H +#define _DEPTH2SPACE_CPU_H + +#include "depth2space.hpp" + +class Depth2SpaceCPU : public Depth2Space { +public: + Depth2SpaceCPU(DataType dt, Depth2SpaceParamSpec p) : Depth2Space(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Depth2SpaceCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS( + depth2space(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + depth2space_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // DEPTH2SPACE_CPU_H diff --git a/inference/engine/include/cpu/eltwise_cpu.hpp b/inference/engine/include/cpu/eltwise_cpu.hpp index 086f858c..14d1581a 100644 --- a/inference/engine/include/cpu/eltwise_cpu.hpp +++ b/inference/engine/include/cpu/eltwise_cpu.hpp @@ -18,71 +18,26 @@ class EltwiseCPU : public Eltwise { public: - EltwiseCPU(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc) + EltwiseCPU(EltwiseParamSpec p) : Eltwise(p) {} std::shared_ptr clone() override { - std::shared_ptr mem = - std::shared_ptr(new EltwiseCPU(this->eltwiseDesc)); + std::shared_ptr mem = std::shared_ptr(new EltwiseCPU(this->p)); *mem = *this; return mem; } - bool use_scale(const std::vector &inputDesc) - { - bool ret; - if (this->eltwiseDesc.elt_mode == ELTWISE_PROD && inputDesc.size() == 2 && - inputDesc[0].nDims > 1 && inputDesc[1].nDims > 1 && - inputDesc[0].dims[inputDesc[0].nDims - 2] == inputDesc[1].dims[inputDesc[1].nDims - 2] && - inputDesc[1].dims[inputDesc[1].nDims - 1] == 1 && - (inputDesc[1].nDims == 2 || (inputDesc[1].nDims == 3 && inputDesc[1].dims[0] == 1) || - (inputDesc[1].nDims == 4 && inputDesc[1].dims[0] == 1 && inputDesc[1].dims[1] == 1)) && - tensorNumElements(inputDesc[0]) != tensorNumElements(inputDesc[1])) { - ret = true; - } else { - ret = false; - } - return ret; - } - void run() override { - std::vector inputDesc; - for (auto p : this->inputTensors) { - inputDesc.push_back(p.get_desc()); - } - if (this->use_scale(inputDesc)) { - Tensor inTensor = this->inputTensors[1]; - U8 *alpha = (U8 *)((CpuMemory *)(inTensor.get_memory()))->get_ptr(); - ScaleParamSpec scaleParam; - scaleParam.axis = 1; - CHECK_STATUS(scale(this->inputTensors[0], alpha, nullptr, scaleParam, - this->outputTensors[0], &this->archInfo)); - } else { - CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp, - this->outputTensors[0], &this->archInfo)); - } + CHECK_STATUS(eltwise( + this->inputTensors, this->p, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - std::vector inputDesc; - for (auto p : inTensors) { - inputDesc.push_back(p->get_desc()); - } - if (this->use_scale(inputDesc)) { - ScaleParamSpec scaleParam; - scaleParam.axis = 1; - TensorDesc desc = inTensors[1]->get_desc(); - U32 axisLen = desc.dims[desc.nDims - 2]; - CHECK_STATUS(scale_infer_output_size( - inTensors[0], scaleParam, axisLen, outTensors[0], &this->archInfo)); - } else { - CHECK_STATUS(eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo)); - } - return SUCCESS; + return eltwise_infer_output_size(inTensors, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/embedding_cpu.hpp b/inference/engine/include/cpu/embedding_cpu.hpp index 410d3c9c..d0a225d3 
100644 --- a/inference/engine/include/cpu/embedding_cpu.hpp +++ b/inference/engine/include/cpu/embedding_cpu.hpp @@ -81,9 +81,9 @@ class EmbeddingCPU : public Embedding { } TensorDesc weightDesc; if (this->p.transpose) { - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_outputs, this->p.num_inputs); } else { - weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_inputs, this->p.num_outputs); } U32 weightBytes = tensorNumBytes(weightDesc); @@ -93,15 +93,15 @@ class EmbeddingCPU : public Embedding { bool set_ptr = false; modelWeightTensor->alloc(); if (modelPtr != nullptr) { - memcpy( + UNI_MEMCPY( ((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), modelPtr, weightBytes); *modelPtrShared = std::shared_ptr(*modelPtrShared, modelPtr + weightBytes); set_ptr = true; } else { auto curOpWs = this->get_weightspec(); if (curOpWs.weight != nullptr) { - memcpy(((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), curOpWs.weight, - weightBytes); + UNI_MEMCPY(((CpuMemory *)(modelWeightTensor->get_memory()))->get_ptr(), + curOpWs.weight, weightBytes); set_ptr = true; } } diff --git a/inference/engine/include/cpu/equal_cpu.hpp b/inference/engine/include/cpu/equal_cpu.hpp index f0253911..5fae144e 100644 --- a/inference/engine/include/cpu/equal_cpu.hpp +++ b/inference/engine/include/cpu/equal_cpu.hpp @@ -44,10 +44,9 @@ class EqualCPU : public Equal { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - int weightBytes = curOpWs.bytes_of_weight; - int weightLen = weightBytes / bytesOf(curOpWs.mdt); this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor2d(this->dt, 1, weightLen)); + this->weightTensors[0].resize( + tensor2d(curOpWs.mdt, 1, curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt))); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/expand_cpu.hpp b/inference/engine/include/cpu/expand_cpu.hpp index 09ebdf37..d57e2756 100644 --- a/inference/engine/include/cpu/expand_cpu.hpp +++ b/inference/engine/include/cpu/expand_cpu.hpp @@ -29,17 +29,36 @@ class ExpandCPU : public Expand { return mem; } + ExpandParamSpec get_param(TensorDesc desc) + { + ExpandParamSpec ps = this->p; + if (ps.num_shape == 0) { + ps.num_shape = desc.dims[0]; + for (int i = 0; i < ps.num_shape; i++) { + ps.shape[i] = desc.dims[desc.nDims + i]; + } + } + return ps; + } + void run() override { - CHECK_STATUS(expand( - this->inputTensors[0], this->p, this->temp, this->outputTensors[0], &this->archInfo)); + ExpandParamSpec ps = p; + if (ps.num_shape == 0 && inputTensors.size() > 1) { + ps = get_param(inputTensors[1].get_desc()); + } + CHECK_STATUS( + expand(this->inputTensors[0], ps, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - expand_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + ExpandParamSpec ps = p; + if (ps.num_shape == 0 && inTensors.size() > 1) { + ps = get_param(inTensors[1]->get_desc()); + } + CHECK_STATUS(expand_infer_output_size(inTensors[0], ps, outTensors[0], &this->archInfo)); return SUCCESS; } }; diff --git a/inference/engine/include/cpu/factory_cpu.hpp b/inference/engine/include/cpu/factory_cpu.hpp index d2d4c35b..ed366356 100644 --- a/inference/engine/include/cpu/factory_cpu.hpp +++ 
b/inference/engine/include/cpu/factory_cpu.hpp @@ -66,7 +66,6 @@ #include "cpu/tdnn_fully_connected_cpu.hpp" #include "cpu/batch_norm_cpu.hpp" #include "cpu/cast_cpu.hpp" -#include "cpu/equal_cpu.hpp" #include "cpu/instance_norm_cpu.hpp" #include "cpu/expand_cpu.hpp" #include "cpu/scatter_cpu.hpp" @@ -74,6 +73,17 @@ #include "cpu/select_cpu.hpp" #include "cpu/topk_cpu.hpp" #include "cpu/gat_cpu.hpp" +#include "cpu/quantizelinear_cpu.hpp" +#include "cpu/grid_sample_cpu.hpp" +#include "cpu/onehot_cpu.hpp" +#include "cpu/cumsum_cpu.hpp" +#include "cpu/non_max_suppression_cpu.hpp" +#include "cpu/constant_of_shape_cpu.hpp" +#include "cpu/non_zero_cpu.hpp" +#include "cpu/roialign_cpu.hpp" +#include "cpu/range_cpu.hpp" +#include "cpu/depth2space_cpu.hpp" +#include "cpu/space2depth_cpu.hpp" class FactoryCPU : public Factory { public: @@ -173,9 +183,10 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + std::shared_ptr createLayerNorm( + DataType dt, LayerNormParamSpec p, U32 weightNum) override { - auto cep = (LayerNorm *)(new LayerNormCPU(dt, weightNum)); + auto cep = (LayerNorm *)(new LayerNormCPU(dt, p, weightNum)); return std::shared_ptr(cep); } @@ -264,9 +275,9 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) override { - auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(dt, desc); + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryCPU(p); return std::shared_ptr(cep); } @@ -288,13 +299,13 @@ class FactoryCPU : public Factory { std::shared_ptr createSpace2Depth(DataType dt, Space2DepthParamSpec p) override { - OP_UNSUP(2, dt, p); + auto cep = new Space2DepthCPU(dt, p); return std::shared_ptr(cep); } std::shared_ptr createDepth2Space(DataType dt, Depth2SpaceParamSpec p) override { - OP_UNSUP(2, dt, p); + auto cep = new Depth2SpaceCPU(dt, p); return std::shared_ptr(cep); } @@ -414,12 +425,6 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createEqual(DataType dt, EqualParamSpec p) override - { - auto cep = new EqualCPU(dt, p); - return std::shared_ptr(cep); - } - std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) override { auto cep = new InstanceNormCPU(dt, p); @@ -450,9 +455,9 @@ class FactoryCPU : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createRoIAlign(RoIAlignParamSpec p) override + std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) override { - OP_UNSUP(1, p); + auto cep = new RoIAlignCPU(dt, p); return std::shared_ptr(cep); } @@ -468,5 +473,54 @@ class FactoryCPU : public Factory { auto cep = new GATCPU(dt, p); return std::shared_ptr(cep); } + + std::shared_ptr createQuantizeLinear(DataType dt, QuantizeLinearParamSpec p) override + { + auto cep = new QuantizeLinearCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) override + { + auto cep = new GridSampleCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) override + { + auto cep = new OneHotCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) override + { + auto cep = new CumSumCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonMaxSuppression( + DataType 
dt, NonMaxSuppressionParamSpec p) override + { + auto cep = new NonMaxSuppressionCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) override + { + auto cep = new ConstantOfShapeCPU(dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonZero(DataType dt) override + { + auto cep = new NonZeroCPU(dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createRange(DataType dt, RangeParamSpec p) override + { + auto cep = new RangeCPU(dt, p); + return std::shared_ptr(cep); + } }; #endif // _FACTORY_CPU_H diff --git a/inference/engine/include/cpu/fully_connected_cpu.hpp b/inference/engine/include/cpu/fully_connected_cpu.hpp index 70838487..701f2019 100644 --- a/inference/engine/include/cpu/fully_connected_cpu.hpp +++ b/inference/engine/include/cpu/fully_connected_cpu.hpp @@ -31,25 +31,15 @@ class FullyConnectedCPU : public FullyConnected { return mem; } - DataType get_float_precision() - { - DataType ret = this->dt; - if (this->dt == DT_F16_8Q) { - ret = DT_F16; - } else if (this->dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } - EE infer_weight_desc() override { - DataType dtNoQ = this->get_float_precision(); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? DT_F32 : dt); auto curOpWs = this->get_weightspec(); + DataType weightDt = curOpWs.mdt; if (curOpWs.bytes_of_weight > 0) { this->weightTensors = std::vector(1); this->weightTensors[0].resize( - tensor2df(dtNoQ, DF_TRANSPOSE, this->p.num_outputs, this->numInput)); + tensor2df(weightDt, DF_TRANSPOSE, this->p.num_outputs, this->numInput)); } if (curOpWs.bytes_of_vec > 0) { this->biasTensors = std::vector(1); @@ -60,53 +50,50 @@ class FullyConnectedCPU : public FullyConnected { Tensor get_weight_tensor() { - Tensor weightTensor; if (weightTensors.size() > 0) { - weightTensor = this->weightTensors[0]; + return this->weightTensors[0]; } else { CHECK_REQUIREMENT(1 < this->inputTensors.size()); - weightTensor = this->inputTensors[1]; - TensorDesc desc = weightTensor.get_desc(); + TensorDesc desc = this->inputTensors[1].get_desc(); if (this->mvm) { desc.df = DF_TRANSPOSE; } else { desc.df = DF_NORMAL; } + Tensor weightTensor = this->inputTensors[1]; weightTensor.resize(desc); + return weightTensor; } - return weightTensor; } Tensor get_bias_tensor() { - Tensor biasTensor; - U32 inputCount = 1; - if (weightTensors.size() == 0) { - inputCount++; - } if (biasTensors.size() > 0) { - biasTensor = this->biasTensors[0]; + return this->biasTensors[0]; } else { + U32 inputCount = 1; + if (weightTensors.size() == 0) { + inputCount++; + } if (inputCount < this->inputTensors.size()) { - biasTensor = this->inputTensors[inputCount++]; + return this->inputTensors[inputCount++]; } + Tensor biasTensor; + return biasTensor; } - return biasTensor; } void run() override { - Tensor inputTensor = this->inputTensors[0]; - TensorDesc inputDesc = inputTensor.get_desc(); - Tensor weightTensor = get_weight_tensor(); Tensor biasTensor = get_bias_tensor(); Tensor outputTensor = this->outputTensors[0]; +#ifdef _USE_INT8 + TensorDesc inputDesc = this->inputTensors[0].get_desc(); TensorDesc outputDesc = outputTensor.get_desc(); - if (featureScale.size() > 1 && featureScale[0][0] > 0 && DT_I8 != inputDesc.dt && DT_U8_Q != inputDesc.dt) { - inputTensor.set_scale(featureScale[0][0]); + this->inputTensors[0].set_scale(featureScale[0][0]); } if (DT_I8 == outputDesc.dt || DT_U8_Q == outputDesc.dt) { if (featureScale.size() > 0) { @@ -115,10 +102,10 @@ class 
FullyConnectedCPU : public FullyConnected { outputTensor.set_scale(-1); } } - +#endif std::vector tmpTensor(1, this->temp); - CHECK_STATUS(fully_connected( - inputTensor, weightTensor, biasTensor, tmpTensor, outputTensor, &this->archInfo)); + CHECK_STATUS(fully_connected(this->inputTensors[0], weightTensor, biasTensor, tmpTensor, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( @@ -146,8 +133,8 @@ class FullyConnectedCPU : public FullyConnected { tmpFilter.resize(weightDesc); CHECK_STATUS(fully_connected_infer_output_size( inTensors[0], tmpFilter, outTensors[0], &this->archInfo)); - TensorDesc outputDesc = outTensors[0]->get_desc(); if (1 == this->p.num_slices) { + TensorDesc outputDesc = outTensors[0]->get_desc(); if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { outputDesc.dt = (DT_F16_8Q == this->dt) ? DT_F16 : DT_F32; @@ -161,9 +148,11 @@ class FullyConnectedCPU : public FullyConnected { } outTensors[0]->resize(outputDesc); } else { - UNI_ERROR_LOG("FC merge is deprecated\n"); + //UNI_ERROR_LOG("FC merge is deprecated\n"); for (U32 i = 0; i < this->p.num_slices; i++) { + TensorDesc outputDesc = outTensors[i]->get_desc(); outputDesc.dims[0] = this->p.slice_point[i]; + UNI_INFO_LOG("-- %d %d\n", p.num_slices, p.slice_point[i]); if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -2 == (featureScale.back())[0]) { outputDesc.dt = (DT_F16_8Q == this->dt) ? DT_F16 : DT_F32; @@ -175,6 +164,7 @@ class FullyConnectedCPU : public FullyConnected { #endif } } + outTensors[i]->resize(outputDesc); } } return SUCCESS; @@ -241,7 +231,7 @@ class FullyConnectedCPU : public FullyConnected { #ifdef _USE_INT8 bool thisIsNoQuant = (featureScale.size() > 1 && featureScale[0].back() == 0); - if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && !thisIsNoQuant) { + if ((DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) && !thisIsNoQuant && (tmpDesc.dt != DT_I8)) { tmpDesc.dt = DT_I8; Tensor qFilter = Tensor::alloc_sized(tmpDesc); F32 scale = -1; diff --git a/inference/engine/include/cpu/gather_cpu.hpp b/inference/engine/include/cpu/gather_cpu.hpp index 14e0e755..725b966b 100644 --- a/inference/engine/include/cpu/gather_cpu.hpp +++ b/inference/engine/include/cpu/gather_cpu.hpp @@ -39,6 +39,18 @@ class GatherCPU : public Gather { std::vector inTensors, std::vector outTensors) override { Tensor tensor0, tensor1; + if (is_shape(inTensors)) { + if ((this->p.data_desc.nDims > 0 && this->weightTensors.size() == 0) || + (this->p.index_desc.nDims > 0 && this->biasTensors.size() == 0)) { + CHECK_STATUS(this->init_weight_bias_from_model()); + } + if (this->p.data_desc.nDims > 0) { + this->p.data_desc = tensor_shape(this->weightTensors[0]); + } + if (this->p.index_desc.nDims > 0) { + this->p.index_desc = tensor_shape(this->biasTensors[0]); + } + } Tensor *dataTensor = get_data_tensor_ptr(inTensors, &tensor0); Tensor *indexTensor = get_index_tensor_ptr(inTensors, &tensor1); CHECK_STATUS(gather_infer_output_size( @@ -49,11 +61,11 @@ class GatherCPU : public Gather { EE infer_weight_desc() override { Tensor dataTensor, indexTensor; - if (this->p.data_desc.nDims > 0) { + if (this->p.data_desc.nDims > 0 && this->weightTensors.size() == 0) { dataTensor.resize(this->p.data_desc); this->weightTensors.push_back(dataTensor); } - if (this->p.index_desc.nDims > 0) { + if (this->p.index_desc.nDims > 0 && this->biasTensors.size() == 0) { indexTensor.resize(this->p.index_desc); this->biasTensors.push_back(indexTensor); 
} diff --git a/inference/engine/include/cpu/grid_sample_cpu.hpp b/inference/engine/include/cpu/grid_sample_cpu.hpp new file mode 100644 index 00000000..c64782da --- /dev/null +++ b/inference/engine/include/cpu/grid_sample_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GRID_SAMPLE_CPU_H +#define _GRID_SAMPLE_CPU_H + +#include "grid_sample.hpp" + +class GridSampleCPU : public GridSample { +public: + GridSampleCPU(DataType dt, GridSampleParamSpec p) : GridSample(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new GridSampleCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(grid_sample(inputTensors[0], inputTensors[1], this->p, this->temp, + outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + return grid_sample_infer_output_size( + inTensors[0], inTensors[1], outTensors[0], &this->archInfo); + } +}; + +#endif // GRID_SAMPLE_CPU_H diff --git a/inference/engine/include/cpu/instance_norm_cpu.hpp b/inference/engine/include/cpu/instance_norm_cpu.hpp index 35d9dbb2..f2ca9dd6 100644 --- a/inference/engine/include/cpu/instance_norm_cpu.hpp +++ b/inference/engine/include/cpu/instance_norm_cpu.hpp @@ -38,33 +38,29 @@ class InstanceNormCPU : public InstanceNorm { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - auto inputDesc = inTensors[0]->get_desc(); - this->set_channels_from_weight(); - TensorDesc outputDesc = inputDesc; - outTensors[0]->resize(outputDesc); + outTensors[0]->resize(inTensors[0]->get_desc()); return SUCCESS; } - void set_channels_from_weight() + int get_channels_num() { + int ret = 0; auto curOpWs = this->get_weightspec(); if (0 != curOpWs.bytes_of_weight) { - this->numChannels = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + ret = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { - this->numChannels = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); - } else { - this->numChannels = 0; + ret = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } + return ret; } EE infer_weight_desc() override { - // weight is scale, bias is bias - this->set_channels_from_weight(); + int num = this->get_channels_num(); this->weightTensors = std::vector(1); - 
this->weightTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->weightTensors[0].resize(tensor1d(this->dt, num)); this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor1d(this->dt, this->numChannels)); + this->biasTensors[0].resize(tensor1d(this->dt, num)); return SUCCESS; } diff --git a/inference/engine/include/cpu/layer_norm_cpu.hpp b/inference/engine/include/cpu/layer_norm_cpu.hpp index da9dbb27..fa88cc9a 100644 --- a/inference/engine/include/cpu/layer_norm_cpu.hpp +++ b/inference/engine/include/cpu/layer_norm_cpu.hpp @@ -18,13 +18,13 @@ class LayerNormCPU : public LayerNorm { public: - LayerNormCPU(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) + LayerNormCPU(DataType dt, LayerNormParamSpec p, U32 weightNum) : LayerNorm(dt, p, weightNum) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new LayerNormCPU(this->dt, this->weightNum)); + std::shared_ptr(new LayerNormCPU(this->dt, this->p, this->weightNum)); *mem = *this; return mem; } @@ -32,7 +32,7 @@ class LayerNormCPU : public LayerNorm { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - DataType dtNoQ = (DT_F16_8Q == this->dt) ? DT_F16 : this->dt; + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? DT_F32 : dt); if (0 != curOpWs.bytes_of_weight) { this->weightNum = curOpWs.bytes_of_weight / bytesOf(curOpWs.mdt); } @@ -67,14 +67,14 @@ class LayerNormCPU : public LayerNorm { Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(layer_normalization( - inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + CHECK_STATUS(layer_normalization(inputTensor, this->p, weightTensor, biasTensor, this->temp, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + EE ret = normalization_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); #ifdef _USE_INT8 if (DT_F16_8Q == this->dt || DT_F32_8Q == this->dt) { if (featureScale.size() > 0 && -1 == (featureScale.back())[0]) { @@ -88,7 +88,7 @@ class LayerNormCPU : public LayerNorm { } } #endif - return SUCCESS; + return ret; } }; diff --git a/inference/engine/include/cpu/logsoftmax_cpu.hpp b/inference/engine/include/cpu/logsoftmax_cpu.hpp index 20d0c1a1..9be1a737 100644 --- a/inference/engine/include/cpu/logsoftmax_cpu.hpp +++ b/inference/engine/include/cpu/logsoftmax_cpu.hpp @@ -16,18 +16,10 @@ #include "cpu/softmax_cpu.hpp" -// LOGSOFTMAX_CPU_V1: y = log(softmax(x)) -// LOGSOFTMAX_CPU_V2: y = (x - reduce_max) - log(reduce_sum(exp(x - reduce_max))) class LogSoftmaxCPU : public SoftmaxCPU { public: LogSoftmaxCPU(DataType dt, SoftmaxParamSpec p) : SoftmaxCPU(dt, p) - { -#ifndef LOGSOFTMAX_CPU_V1 - TensorDesc maskDesc; - maskDesc.nDims = 0; - reductionMask.resize(maskDesc); -#endif - } + {} OperatorType get_type() override { @@ -44,79 +36,8 @@ class LogSoftmaxCPU : public SoftmaxCPU { void run() override { -#ifdef LOGSOFTMAX_CPU_V1 - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_LOG; - CHECK_STATUS( - softmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); - CHECK_STATUS( - activation(outputTensors[0], activationDesc, outputTensors[0], &this->archInfo)); -#else - Tensor tmp, newInput; - U8 *data = (U8 *)((CpuMemory *)(this->temp.get_memory()))->get_ptr(); - std::shared_ptr p1(data, [](U8 *ptr) 
{}); - newInput.resize(inputTensors[0].get_desc()); - ((CpuMemory *)(reductionResult.get_memory()))->set_shared_ptr(p1); - std::shared_ptr p2(data + reductionResult.bytes(), [](U8 *ptr) {}); - ((CpuMemory *)(newInput.get_memory()))->set_shared_ptr(p2); - std::shared_ptr p3(data + reductionResult.bytes() + newInput.bytes(), [](U8 *ptr) {}); - ((CpuMemory *)(tmp.get_memory()))->set_shared_ptr(p3); - - ReductionParamSpec reductionSpec = get_reduction_param(); - reductionSpec.reduction_mode = REDUCTION_MAX; - CHECK_STATUS(reduction( - inputTensors[0], reductionMask, reductionSpec, tmp, reductionResult, &this->archInfo)); - EltwiseParamSpec eltwiseSpec; - eltwiseSpec.elt_mode = ELTWISE_SUB; - eltwiseSpec.activation_type = ACTIVATION_NULL; - std::vector tmpInput = {inputTensors[0], reductionResult}; - CHECK_STATUS(eltwise(tmpInput, eltwiseSpec, tmp, newInput, &this->archInfo)); - - ActivationParamSpec activationSpec; - activationSpec.mode = ACTIVATION_EXP; - CHECK_STATUS(activation(newInput, activationSpec, outputTensors[0], &this->archInfo)); - - CHECK_STATUS(reduction(outputTensors[0], reductionMask, get_reduction_param(), tmp, - reductionResult, &this->archInfo)); - - activationSpec.mode = ACTIVATION_LOG; - CHECK_STATUS(activation(reductionResult, activationSpec, reductionResult, &this->archInfo)); - - tmpInput = {newInput, reductionResult}; - CHECK_STATUS(eltwise(tmpInput, eltwiseSpec, tmp, outputTensors[0], &this->archInfo)); -#endif - } - -#ifndef LOGSOFTMAX_CPU_V1 - ReductionParamSpec get_reduction_param() - { - ReductionParamSpec reductionSpec; - reductionSpec.axes_num = 1; - reductionSpec.axes[0] = this->p.axis; - reductionSpec.reduction_mode = REDUCTION_SUM; - reductionSpec.keep_dim = true; - reductionSpec.coeff = 1; - return reductionSpec; - } - - U32 infer_tmp_memory_size() override - { - U32 bytes1 = 0, bytes2 = 0; - CHECK_STATUS(reduction_infer_output_size(&(inputTensors[0]), reductionMask, - get_reduction_param(), &reductionResult, &this->archInfo)); - - CHECK_STATUS(reduction_infer_forward_tmp_bytes( - inputTensors[0], get_reduction_param(), reductionResult, &bytes1, &this->archInfo)); - - std::vector tmpInput = {inputTensors[0], reductionResult}; CHECK_STATUS( - eltwise_infer_forward_tmp_bytes(tmpInput, inputTensors[0], &bytes2, &this->archInfo)); - return inputTensors[0].bytes() + reductionResult.bytes() + UNI_MAX(bytes1, bytes2); + logsoftmax(inputTensors[0], this->p, this->temp, outputTensors[0], &this->archInfo)); } - -private: - Tensor reductionResult; - Tensor reductionMask; -#endif }; #endif // LOGSOFTMAX_CPU_H diff --git a/inference/engine/include/cpu/matmul_cpu.hpp b/inference/engine/include/cpu/matmul_cpu.hpp index b99c01ed..f1342cc7 100644 --- a/inference/engine/include/cpu/matmul_cpu.hpp +++ b/inference/engine/include/cpu/matmul_cpu.hpp @@ -32,16 +32,17 @@ class MatMulCPU : public MatMul { void run() override { Tensor inputTensorA = this->inputTensors[0]; - TensorDesc inputDescA = inputTensorA.get_desc(); Tensor inputTensorB = this->inputTensors[1]; - TensorDesc inputDescB = inputTensorB.get_desc(); Tensor inputTensorC; if (this->inputTensors.size() > 2) { inputTensorC = this->inputTensors[2]; } Tensor outputTensor = this->outputTensors[0]; - TensorDesc outputDesc = outputTensor.get_desc(); +#ifdef _USE_INT8 + TensorDesc inputDescA = inputTensorA.get_desc(); + TensorDesc inputDescB = inputTensorB.get_desc(); + TensorDesc outputDesc = outputTensor.get_desc(); if (3 == featureScale.size() && featureScale[0][0] > 0 && DT_I8 != inputDescA.dt && DT_U8_Q != inputDescA.dt) 
{ inputTensorA.set_scale(featureScale[0][0]); @@ -53,6 +54,7 @@ class MatMulCPU : public MatMul { if (featureScale.size() > 0) { outputTensor.set_scale((featureScale.back())[0]); } +#endif std::vector tmpTensor(1, this->temp); CHECK_STATUS(matmul(inputTensors[0], this->p.transpose_a, inputTensors[1], this->p.transpose_b, inputTensorC, tmpTensor, outputTensors[0], &this->archInfo)); diff --git a/inference/engine/include/cpu/non_max_suppression_cpu.hpp b/inference/engine/include/cpu/non_max_suppression_cpu.hpp new file mode 100644 index 00000000..7ce02edb --- /dev/null +++ b/inference/engine/include/cpu/non_max_suppression_cpu.hpp @@ -0,0 +1,45 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_MAX_SUPPRESSION_CPU_H +#define _NON_MAX_SUPPRESSION_CPU_H + +#include "non_max_suppression.hpp" + +class NonMaxSuppressionCPU : public NonMaxSuppression { +public: + NonMaxSuppressionCPU(DataType dt, NonMaxSuppressionParamSpec p) : NonMaxSuppression(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new NonMaxSuppressionCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(non_max_suppression(inputTensors, this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + return non_max_suppression_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); + } +}; + +#endif // NON_MAX_SUPPRESSION_CPU_H diff --git a/inference/engine/include/cpu/non_zero_cpu.hpp b/inference/engine/include/cpu/non_zero_cpu.hpp new file mode 100644 index 00000000..ad18446b --- /dev/null +++ b/inference/engine/include/cpu/non_zero_cpu.hpp @@ -0,0 +1,47 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_ZERO_CPU_H +#define _NON_ZERO_CPU_H + +#include "non_zero.hpp" + +class NonZeroCPU : public NonZero { +public: + NonZeroCPU(DataType dt) : NonZero(dt) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = std::shared_ptr(new NonZeroCPU(this->dt)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(non_zero(inputTensors[0], outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc inDesc = inTensors[0]->get_desc(); + int num = tensorNumElements(inDesc); + TensorDesc outDesc = tensor2df(DT_I32, DF_NORMAL, inDesc.nDims, num); + outTensors[0]->resize(outDesc); + return SUCCESS; + } +}; + +#endif // NON_ZERO_CPU_H diff --git a/inference/engine/include/cpu/onehot_cpu.hpp b/inference/engine/include/cpu/onehot_cpu.hpp new file mode 100644 index 00000000..faf90465 --- /dev/null +++ b/inference/engine/include/cpu/onehot_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
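NonZeroCPU above cannot know how many non-zero elements exist at shape-inference time, so it reserves the worst case: a DT_I32 tensor of shape [nDims, numElements], one coordinate row per input dimension. A minimal sketch of that sizing rule, assuming the same helpers used in the patch (the free function name is only for illustration):

// Worst-case output sizing mirroring NonZeroCPU::infer_output_tensors_size.
// For a 2x3 input, every element could be non-zero, so the reserved output
// descriptor is DT_I32 with shape [2, 6]; the real count is only known in run().
TensorDesc nonZeroWorstCaseDesc(const TensorDesc &inDesc)
{
    int num = tensorNumElements(inDesc);                      // 6 for a 2x3 input
    return tensor2df(DT_I32, DF_NORMAL, inDesc.nDims, num);   // [2, 6]
}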
+ +#ifndef _ONEHOT_CPU_H +#define _ONEHOT_CPU_H + +#include "onehot.hpp" + +class OneHotCPU : public OneHot { +public: + OneHotCPU(DataType dt, OneHotParamSpec p) : OneHot(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new OneHotCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(onehot(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS(onehot_infer_output_size( + inTensors[0], this->p, this->dt, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // ONEHOT_CPU_H diff --git a/inference/engine/include/cpu/power_cpu.hpp b/inference/engine/include/cpu/power_cpu.hpp index e997faaf..fffc074a 100644 --- a/inference/engine/include/cpu/power_cpu.hpp +++ b/inference/engine/include/cpu/power_cpu.hpp @@ -42,7 +42,7 @@ class PowerCPU : public Power { auto inPtr = ((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); auto outPtr = ((CpuMemory *)(outputTensor.get_memory()))->get_ptr(); if (inPtr != outPtr) { - memcpy(outPtr, inPtr, tensorNumBytes(inputDesc)); + UNI_MEMCPY(outPtr, inPtr, tensorNumBytes(inputDesc)); } #endif } else { @@ -53,7 +53,8 @@ class PowerCPU : public Power { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - return power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo); + CHECK_STATUS(power_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; } }; diff --git a/inference/engine/include/cpu/preallocated_memory_cpu.hpp b/inference/engine/include/cpu/preallocated_memory_cpu.hpp index 92a0fab5..a54338ad 100644 --- a/inference/engine/include/cpu/preallocated_memory_cpu.hpp +++ b/inference/engine/include/cpu/preallocated_memory_cpu.hpp @@ -18,30 +18,27 @@ class PreAllocatedMemoryCPU : public PreAllocatedMemory { public: - PreAllocatedMemoryCPU(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + PreAllocatedMemoryCPU(PreAllocatedMemoryParamSpec p) : PreAllocatedMemory(p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new PreAllocatedMemoryCPU(this->dt, this->desc)); + std::shared_ptr(new PreAllocatedMemoryCPU(this->p)); *mem = *this; return mem; } void run() override { - CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(preallocated_memory(this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - if (inTensors.size() > 0) { - CHECK_STATUS(NOT_MATCH); - } - outTensors[0]->resize(this->desc); - return SUCCESS; + return preallocated_memory_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/prelu_cpu.hpp b/inference/engine/include/cpu/prelu_cpu.hpp index 5f81a573..3e9dcdc8 100644 --- a/inference/engine/include/cpu/prelu_cpu.hpp +++ b/inference/engine/include/cpu/prelu_cpu.hpp @@ -31,28 +31,35 @@ class PReLUCPU : public PReLU { EE infer_weight_desc() override { auto curOpWs = this->get_weightspec(); - U32 weightNum = 0; - if (curOpWs.weight != nullptr) { - weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + U32 weightNum = (curOpWs.weight == nullptr) + ? 
0 + : curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); + if (weightNum > 0) { + Tensor weightTensor; + weightTensor.resize(tensor1d(this->dt, weightNum)); + this->weightTensors.push_back(weightTensor); } - if (weightNum == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } - if (weightNum == 1) { - this->preluDesc.propagate_down = true; - } else { - this->preluDesc.propagate_down = false; - } - Tensor weightTensor; - weightTensor.resize(tensor1d(this->dt, weightNum)); - this->weightTensors.push_back(weightTensor); return SUCCESS; } void run() override { - CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, - this->outputTensors[0], &this->archInfo)); + Tensor weight; + if (this->weightTensors.size() > 0) { + weight = this->weightTensors[0]; + } else if (this->inputTensors.size() > 1) { + weight = this->inputTensors[1]; + } else { + UNI_ERROR_LOG("operator:%s type:%s doesn't have weight.\n", this->name.c_str(), + OperatorTypeName()[this->get_type()]); + } + if (weight.length() == 1) { + this->p.propagate_down = true; + } else { + this->p.propagate_down = false; + } + CHECK_STATUS( + prelu(this->inputTensors[0], weight, this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/cpu/quantizelinear_cpu.hpp b/inference/engine/include/cpu/quantizelinear_cpu.hpp new file mode 100644 index 00000000..3bd8125b --- /dev/null +++ b/inference/engine/include/cpu/quantizelinear_cpu.hpp @@ -0,0 +1,72 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
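+// CPU QuantizeLinear wrapper: when input and output already share a data type the
+// tensor is copied through unchanged; otherwise the data is quantized with the
+// calibrated featureScale entry if one exists, or with a dynamically derived scale
+// (scale = -1 requests per-layer quantization).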
+ +#ifndef _QUANTIZELINEAR_CPU_H +#define _QUANTIZELINEAR_CPU_H + +#include "quantizelinear.hpp" + +class QuantizeLinearCPU : public QuantizeLinear { +public: + QuantizeLinearCPU(DataType dt, QuantizeLinearParamSpec p) : QuantizeLinear(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new QuantizeLinearCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + F32 scale = -1; // default per layer + TensorDesc inputDesc = this->inputTensors[0].get_desc(); + TensorDesc outputDesc = this->outputTensors[0].get_desc(); + if (inputDesc.dt == outputDesc.dt) { + UNI_MEMCPY(get_ptr_from_tensor(this->outputTensors[0], this->archInfo.arch), + get_ptr_from_tensor(this->inputTensors[0], this->archInfo.arch), + tensorNumBytes(this->inputTensors[0].get_desc())); + return; + } + if (featureScale.size() > 0 && featureScale[0].size() > 0 && featureScale[0][0] > 0) { + scale = featureScale[0][0]; + } + CHECK_STATUS( + quantize(this->inputTensors[0], &this->outputTensors[0], &scale, &this->archInfo)); + this->outputTensors[0].set_scale(scale); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + TensorDesc outputDesc = inTensors[0]->get_desc(); + if (this->dt == DT_F32_8Q || this->dt == DT_F16_8Q) { +#ifdef _USE_X86 + outputDesc.dt = p.dt; + + // special case, matmul mvm + if (outputDesc.nDims >= 2 && outputDesc.dims[1] != 1) { + outputDesc.dt = DT_U8_Q; + } +#else + outputDesc.dt = DT_I8; +#endif + } + outTensors[0]->resize(outputDesc); + return SUCCESS; + } +}; + +#endif // _QUANTIZELINEAR_CPU_H diff --git a/inference/engine/include/cpu/range_cpu.hpp b/inference/engine/include/cpu/range_cpu.hpp new file mode 100644 index 00000000..af4bd19f --- /dev/null +++ b/inference/engine/include/cpu/range_cpu.hpp @@ -0,0 +1,73 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
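+// CPU Range wrapper: fills the last output with the arithmetic sequence
+// start + i * delta for i in [0, (limit - start) / delta); when a second output
+// exists, the first output receives the element count.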
+
+#ifndef _RANGE_CPU_H
+#define _RANGE_CPU_H
+
+#include "range.hpp"
+
+class RangeCPU : public Range {
+public:
+    RangeCPU(DataType dt, RangeParamSpec p) : Range(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem = std::shared_ptr<Operator>(new RangeCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        int idx = outputTensors.size() - 1;
+        TensorDesc desc = outputTensors[idx].get_desc();
+        I32 length = (p.limit - p.start) / p.delta;
+        switch (desc.dt) {
+            case DT_I32: {
+                I32 *ptr = (I32 *)((CpuMemory *)(outputTensors[idx].get_memory()))->get_ptr();
+                for (int i = 0; i < length; i++) {
+                    ptr[i] = p.start + p.delta * i;
+                }
+                break;
+            }
+            default:
+                UNI_ERROR_LOG("Range currently does not support data type %d.\n", desc.dt);
+                break;
+        }
+        if (outputTensors.size() > 1) {
+            U32 *ptr = (U32 *)((CpuMemory *)(outputTensors[0].get_memory()))->get_ptr();
+            *ptr = length;
+        }
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        U32 length = (p.limit - p.start) / p.delta;
+        TensorDesc desc0 = tensor1d(DT_U32, length);
+        desc0.df = DF_SCALAR;
+        desc0.dims[1] = length;
+        TensorDesc desc1 = tensor1d(p.dt, length);
+        if (outTensors.size() >= 1) {
+            outTensors[outTensors.size() - 1]->resize(desc1);
+        }
+        if (outTensors.size() == 2) {
+            outTensors[outTensors.size() - 2]->resize(desc0);
+        }
+        return SUCCESS;
+    }
+};
+
+#endif  // _RANGE_CPU_H
diff --git a/inference/engine/include/cpu/repeat_cpu.hpp b/inference/engine/include/cpu/repeat_cpu.hpp
index 10ec2b0e..8a7becb6 100644
--- a/inference/engine/include/cpu/repeat_cpu.hpp
+++ b/inference/engine/include/cpu/repeat_cpu.hpp
@@ -39,7 +39,7 @@ class RepeatCPU : public Repeat {
         if (this->inputTensors.size() > 1) {
             Tensor inputTensor = this->inputTensors[1];
             TensorDesc inputDesc = inputTensor.get_desc();
-            I32 *ptr = (I32 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
+            U8 *ptr = (U8 *)(((CpuMemory *)(inputTensor.get_memory()))->get_ptr());
             U32 length = tensorNumElements(inputDesc);
             for (U32 i = 0; i < length; i++) {
                 // end loop
diff --git a/inference/engine/include/cpu/reshape_cpu.hpp b/inference/engine/include/cpu/reshape_cpu.hpp
index eec8571b..1df80a3d 100644
--- a/inference/engine/include/cpu/reshape_cpu.hpp
+++ b/inference/engine/include/cpu/reshape_cpu.hpp
@@ -29,6 +29,18 @@ class ReshapeCPU : public Reshape {
         return mem;
     }
 
+    ReshapeParamSpec get_param(TensorDesc desc)
+    {
+        ReshapeParamSpec ps = this->p;
+        if (ps.num_shape == 0) {
+            ps.num_shape = desc.dims[0];
+            for (int i = 0; i < ps.num_shape; i++) {
+                ps.shape[i] = desc.dims[desc.nDims + i];
+            }
+        }
+        return ps;
+    }
+
     void run() override
     {
         Tensor inputTensor = this->inputTensors[0];
@@ -37,9 +49,9 @@ class ReshapeCPU : public Reshape {
         Tensor tmpOutputTensor = outputTensor;
         auto inputDesc = inputTensor.get_desc();
         auto outputDesc = outputTensor.get_desc();
+        auto tmpOutputDesc = outputDesc;
         // if axis is 8, the mode of a model for reshape is tflite.
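+        // In that case run() writes into a temporary NHWC buffer first and
+        // transforms the result back to NCHW after the reshape.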
if (this->p.axis == 8 && outputDesc.nDims == 4) { - auto tmpOutputDesc = outputTensor.get_desc(); tmpOutputDesc.df = DF_NHWC; tmpOutputTensor = this->temp; tmpOutputTensor.resize(tmpOutputDesc); @@ -61,7 +73,6 @@ class ReshapeCPU : public Reshape { // NHWC -> NCHW if (this->p.axis == 8 && outputDesc.nDims == 4) { auto outputDesc = outputTensor.get_desc(); - auto tmpOutputDesc = tmpOutputTensor.get_desc(); void *tmpOutputPtr = ((CpuMemory *)(tmpOutputTensor.get_memory()))->get_ptr(); transformToNCHW(tmpOutputDesc, tmpOutputPtr, outputDesc, ((CpuMemory *)(outputTensor.get_memory()))->get_ptr()); @@ -72,9 +83,11 @@ class ReshapeCPU : public Reshape { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - reshape_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + ReshapeParamSpec ps = this->p; + if (ps.num_shape == 0 && inTensors.size() > 1) { + ps = get_param(inTensors[1]->get_desc()); + } + return reshape_infer_output_size(inTensors[0], ps, outTensors[0], &this->archInfo); } U32 infer_tmp_memory_size() override @@ -82,6 +95,9 @@ class ReshapeCPU : public Reshape { U32 bytes = 0; CHECK_STATUS(reshape_infer_forward_tmp_bytes( this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo)); + if (this->p.axis == 8) { + bytes += UNI_MAX(this->inputTensors[0].bytes(), this->outputTensors[0].bytes()); + } return bytes; } }; diff --git a/inference/engine/include/cpu/resize_cpu.hpp b/inference/engine/include/cpu/resize_cpu.hpp index cee728bc..93167c32 100644 --- a/inference/engine/include/cpu/resize_cpu.hpp +++ b/inference/engine/include/cpu/resize_cpu.hpp @@ -19,58 +19,35 @@ class ResizeCPU : public Resize { public: - ResizeCPU(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p) + ResizeCPU(DataType dt, ResizeParamSpec p) : Resize(dt, p) {} std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ResizeCPU(this->paramDT, this->p)); + std::shared_ptr(new ResizeCPU(this->dt, this->p)); *mem = *this; return mem; } void run() override { - CHECK_STATUS(resize(inputTensors[0], temp, outputTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(resize(inputTensors[0], this->p, temp, outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - U32 bytes; - switch (paramDT) { - case DT_F32: { - CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]); - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, - this->p.scales + 2, outTensors[0], &bytes, &this->archInfo)); - break; - } - case DT_U32: { - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, this->p.sizes, - outTensors[0], &bytes, &this->archInfo)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } - return SUCCESS; + return resize_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo); } U32 infer_tmp_memory_size() override { - U32 size = 0; - TensorDesc inputDesc = inputTensors[0].get_desc(); - if (DF_NCHW == inputDesc.df && (IS_ARM(archInfo.arch) || IS_X86(archInfo.arch))) { - U32 paddedC = (inputDesc.dims[2] + 7) / 8 * 8; - TensorDesc outputDesc = outputTensors[0].get_desc(); - inputDesc.dims[2] = paddedC; - outputDesc.dims[2] = paddedC; - size = tensorNumBytes(inputDesc) + tensorNumBytes(outputDesc); - } - return size; + U32 bytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo)); + return 
bytes; } }; -#endif // _RESIZECPU_H +#endif // _RESIZE_CPU_H diff --git a/inference/engine/include/cpu/rnn_cpu.hpp b/inference/engine/include/cpu/rnn_cpu.hpp index e0691525..70ab5eab 100644 --- a/inference/engine/include/cpu/rnn_cpu.hpp +++ b/inference/engine/include/cpu/rnn_cpu.hpp @@ -36,17 +36,17 @@ class RNNCPU : public RNNCellCPU { U8 *state = (U8 *)get_ptr_from_tensor(this->temp, this->archInfo.arch); TensorDesc desc = inputTensor.get_desc(); int batch = desc.dims[desc.nDims - 1]; - I32 num = p.biDirection ? 2 : 1; - I32 column = this->p.numProjection > 0 ? this->p.numProjection : this->p.numOutput; - U32 ch_size = (this->p.numOutput + column) * bytesOf(desc.dt); + I32 num = p.bi_direction ? 2 : 1; + I32 column = this->p.num_projection > 0 ? this->p.num_projection : this->p.num_outputs; + U32 ch_size = (this->p.num_outputs + column) * bytesOf(desc.dt); if (this->inputTensors.size() == 1) { // bi-direction rnn has forward-states and backward-states - memset(state, 0, batch * num * ch_size); + UNI_MEMSET(state, 0, batch * num * ch_size); } else if (this->inputTensors.size() == 2) { if (num != 1) { UNI_ERROR_LOG("currently not support to set bi-direction RNN's h or c.\n"); } - memcpy(state, get_ptr_from_tensor(this->inputTensors[1], this->archInfo.arch), + UNI_MEMCPY(state, get_ptr_from_tensor(this->inputTensors[1], this->archInfo.arch), tensorNumBytes(this->inputTensors[1].get_desc())); } else if (this->inputTensors.size() == 3) { if (num != 1) { @@ -59,8 +59,8 @@ class RNNCPU : public RNNCellCPU { U32 input_c_tile = tensorNumBytes(this->inputTensors[2].get_desc()) / batch; for (int i = 0; i < batch; i++) { U8 *ptr = state + i * ch_size; - memcpy(ptr, c + input_c_tile * i, input_c_tile); - memcpy(ptr + c_size, h + input_h_tile * i, input_h_tile); + UNI_MEMCPY(ptr, c + input_c_tile * i, input_c_tile); + UNI_MEMCPY(ptr + c_size, h + input_h_tile * i, input_h_tile); } } @@ -69,7 +69,7 @@ class RNNCPU : public RNNCellCPU { tmpTensor, this->outputTensors, &this->archInfo)); if (this->outputTensors.size() == 2) { - memcpy(get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch), state, + UNI_MEMCPY(get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch), state, tensorNumBytes(this->outputTensors[1].get_desc())); } else if (this->outputTensors.size() == 3) { U8 *h = (U8 *)get_ptr_from_tensor(this->outputTensors[1], this->archInfo.arch); @@ -79,8 +79,8 @@ class RNNCPU : public RNNCellCPU { U32 output_c_tile = tensorNumBytes(this->outputTensors[2].get_desc()) / batch; for (int i = 0; i < batch; i++) { U8 *ptr = state + i * ch_size; - memcpy(c + output_c_tile * i, ptr, output_c_tile); - memcpy(h + output_h_tile * i, ptr + c_size, output_h_tile); + UNI_MEMCPY(c + output_c_tile * i, ptr, output_c_tile); + UNI_MEMCPY(h + output_h_tile * i, ptr + c_size, output_h_tile); } } } @@ -89,16 +89,12 @@ class RNNCPU : public RNNCellCPU { std::vector inTensors, std::vector outTensors) override { TensorDesc inputDesc = inTensors[0]->get_desc(); - - if (inputDesc.nDims < 3) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(inputDesc.nDims >= 3); this->xDim = inputDesc.dims[inputDesc.nDims - 3]; for (U32 i = 0; i < inputDesc.nDims - 3; ++i) { xDim *= inputDesc.dims[i]; } - CHECK_STATUS(rnn_infer_output_size(inTensors, this->p, outTensors, &this->archInfo)); - return SUCCESS; + return rnn_infer_output_size(inTensors, this->p, outTensors, &this->archInfo); } U32 infer_tmp_memory_size() override diff --git a/inference/engine/include/cpu/rnncell_cpu.hpp 
b/inference/engine/include/cpu/rnncell_cpu.hpp index fe2b13f7..d595876f 100644 --- a/inference/engine/include/cpu/rnncell_cpu.hpp +++ b/inference/engine/include/cpu/rnncell_cpu.hpp @@ -43,7 +43,7 @@ class RNNCellCPU : public RNNCell { tmpOffset = xTensor.bytes(); } CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p, - this->xDim, this->p.numOutput, tmpOffset, tmpTensor, hTensor, &this->archInfo)); + this->xDim, this->p.num_outputs, tmpOffset, tmpTensor, hTensor, &this->archInfo)); } EE infer_output_tensors_size( @@ -96,14 +96,14 @@ class RNNCellCPU : public RNNCell { EE infer_weight_desc() override { - int directions = (this->p.biDirection) ? 2 : 1; + int directions = (this->p.bi_direction) ? 2 : 1; int weightNum, biasNum, column; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { weightNum = biasNum = 2; - column = this->p.numProjection; + column = this->p.num_projection; } else { weightNum = biasNum = 1; - column = this->p.numOutput; + column = this->p.num_outputs; } int gates = 0; switch (this->p.mode) { @@ -121,12 +121,12 @@ class RNNCellCPU : public RNNCell { return NOT_SUPPORTED; } U32 filterRow = gates * column; - U32 filterCol = this->xDim + this->p.numOutput; + U32 filterCol = this->xDim + this->p.num_outputs; std::vector weight_desc(2), bias_desc(2); weight_desc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); - weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + weight_desc[1] = tensor2df(this->dt, DF_NK, this->p.num_outputs, this->p.num_projection); bias_desc[0] = tensor1d(this->dt, filterRow); - bias_desc[1] = tensor1d(this->dt, this->p.numOutput); + bias_desc[1] = tensor1d(this->dt, this->p.num_outputs); this->weightTensors = std::vector(directions * weightNum); this->biasTensors = std::vector(directions * biasNum); for (int i = 0, wid = 0, vid = 0; i < directions; i++) { diff --git a/inference/engine/include/cpu/roialign_cpu.hpp b/inference/engine/include/cpu/roialign_cpu.hpp new file mode 100644 index 00000000..c142ede1 --- /dev/null +++ b/inference/engine/include/cpu/roialign_cpu.hpp @@ -0,0 +1,53 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
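+// CPU RoIAlign wrapper: run() forwards the input tensors to the roialign kernel
+// together with a scratch buffer sized by roialign_infer_forward_tmp_bytes.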
+
+#ifndef _ROIALIGN_CPU_H
+#define _ROIALIGN_CPU_H
+
+#include "roialign.hpp"
+
+class RoIAlignCPU : public RoIAlign {
+public:
+    RoIAlignCPU(DataType dt, RoIAlignParamSpec p) : RoIAlign(dt, p)
+    {}
+
+    std::shared_ptr<Operator> clone() override
+    {
+        std::shared_ptr<Operator> mem =
+            std::shared_ptr<Operator>(new RoIAlignCPU(this->dt, this->p));
+        *mem = *this;
+        return mem;
+    }
+
+    void run() override
+    {
+        CHECK_STATUS(roialign(inputTensors, this->p, this->temp, outputTensors[0], &this->archInfo));
+    }
+
+    EE infer_output_tensors_size(
+        std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
+    {
+        CHECK_STATUS(roialign_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo));
+        return SUCCESS;
+    }
+
+    U32 infer_tmp_memory_size() override
+    {
+        U32 bytes = 0;
+        CHECK_STATUS(roialign_infer_forward_tmp_bytes(
+            this->inputTensors[0], this->outputTensors[0], &bytes, &this->archInfo));
+        return bytes;
+    }
+};
+
+#endif  // _ROIALIGN_CPU_H
diff --git a/inference/engine/include/cpu/scale_cpu.hpp b/inference/engine/include/cpu/scale_cpu.hpp
index 669c11ae..d2a54722 100644
--- a/inference/engine/include/cpu/scale_cpu.hpp
+++ b/inference/engine/include/cpu/scale_cpu.hpp
@@ -24,7 +24,7 @@ class ScaleCPU : public Scale {
     std::shared_ptr<Operator> clone() override
     {
         std::shared_ptr<Operator> mem =
-            std::shared_ptr<Operator>(new ScaleCPU(this->dt, this->p, this->numChannels));
+            std::shared_ptr<Operator>(new ScaleCPU(this->dt, this->p, 0));
         *mem = *this;
         return mem;
     }
diff --git a/inference/engine/include/cpu/shape_cpu.hpp b/inference/engine/include/cpu/shape_cpu.hpp
index 10df2bc7..a111f749 100644
--- a/inference/engine/include/cpu/shape_cpu.hpp
+++ b/inference/engine/include/cpu/shape_cpu.hpp
@@ -33,15 +33,23 @@ class ShapeCPU : public Shape {
         Tensor inputTensor = this->inputTensors[0];
         TensorDesc inputDesc = inputTensor.get_desc();
         Tensor outputTensor = this->outputTensors[0];
-        UNI_MEMCPY(((CpuMemory *)(outputTensor.get_memory()))->get_ptr(), inputDesc.dims,
-            inputDesc.nDims * sizeof(U32));
+        U32 *ptr = (U32 *)((CpuMemory *)(outputTensor.get_memory()))->get_ptr();
+        for (U32 i = 0; i < inputDesc.nDims; i++) {
+            ptr[i] = inputDesc.dims[inputDesc.nDims - 1 - i];
+        }
     }
 
     EE infer_output_tensors_size(
         std::vector<Tensor *> inTensors, std::vector<Tensor *> outTensors) override
     {
         TensorDesc inputDesc = inTensors[0]->get_desc();
-        TensorDesc outputDesc = tensor1d(DT_U32, inputDesc.nDims);
+        TensorDesc outputDesc;
+        outputDesc.dt = DT_U32;
+        outputDesc.nDims = 1;
+        outputDesc.dims[0] = inputDesc.nDims;
+        for (U32 i = 0; i < inputDesc.nDims; i++) {
+            outputDesc.dims[outputDesc.nDims + i] = inputDesc.dims[inputDesc.nDims - 1 - i];
+        }
         outTensors[0]->resize(outputDesc);
         return SUCCESS;
     }
diff --git a/inference/engine/include/cpu/shared_weight_cpu.hpp b/inference/engine/include/cpu/shared_weight_cpu.hpp
index eec5ec9e..f73a28b6 100644
--- a/inference/engine/include/cpu/shared_weight_cpu.hpp
+++ b/inference/engine/include/cpu/shared_weight_cpu.hpp
@@ -61,12 +61,12 @@ class SharedWeightCPU : public SharedWeight {
         U32 weightBytes = modelWeightTensor.bytes();
         modelWeightTensor.alloc();
         if (modelPtr != nullptr) {
-            memcpy(
+            UNI_MEMCPY(
                 ((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), modelPtr, weightBytes);
             *modelPtrShared = std::shared_ptr<U8>(*modelPtrShared, modelPtr + weightBytes);
         } else {
             auto curOpWs = this->get_weightspec();
-            memcpy(((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), curOpWs.weight,
+            UNI_MEMCPY(((CpuMemory *)(modelWeightTensor.get_memory()))->get_ptr(), curOpWs.weight,
                 weightBytes);
         }
         this->weightTensors.push_back(modelWeightTensor);
diff --git
a/inference/engine/include/cpu/slice_cpu.hpp b/inference/engine/include/cpu/slice_cpu.hpp index 4321b20a..2f618572 100644 --- a/inference/engine/include/cpu/slice_cpu.hpp +++ b/inference/engine/include/cpu/slice_cpu.hpp @@ -39,14 +39,7 @@ class SliceCPU : public Slice { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo)); - auto outDesc = outTensors[0]->get_desc(); - if (outDesc.nDims == 3 && outDesc.dims[1] == 1 && outDesc.dims[2] == 1) { - outDesc.nDims = 2; - outDesc.df = DF_NORMAL; - outTensors[0]->resize(outDesc); - } - return SUCCESS; + return slice_infer_output_size(inTensors[0], this->p, outTensors, &this->archInfo); } }; diff --git a/inference/engine/include/cpu/softmax_cpu.hpp b/inference/engine/include/cpu/softmax_cpu.hpp index 15650c4f..bff819b5 100644 --- a/inference/engine/include/cpu/softmax_cpu.hpp +++ b/inference/engine/include/cpu/softmax_cpu.hpp @@ -38,9 +38,7 @@ class SoftmaxCPU : public Softmax { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - softmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return softmax_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cpu/space2depth_cpu.hpp b/inference/engine/include/cpu/space2depth_cpu.hpp new file mode 100644 index 00000000..7bf54335 --- /dev/null +++ b/inference/engine/include/cpu/space2depth_cpu.hpp @@ -0,0 +1,46 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _SPACE2DEPTH_CPU_H +#define _SPACE2DEPTH_CPU_H + +#include "space2depth.hpp" + +class Space2DepthCPU : public Space2Depth { +public: + Space2DepthCPU(DataType dt, Space2DepthParamSpec p) : Space2Depth(dt, p) + {} + + std::shared_ptr clone() override + { + std::shared_ptr mem = + std::shared_ptr(new Space2DepthCPU(this->dt, this->p)); + *mem = *this; + return mem; + } + + void run() override + { + CHECK_STATUS(space2depth(inputTensors[0], this->p, outputTensors[0], &this->archInfo)); + } + + EE infer_output_tensors_size( + std::vector inTensors, std::vector outTensors) override + { + CHECK_STATUS( + space2depth_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); + return SUCCESS; + } +}; + +#endif // SPACE2DEPTH_CPU_H diff --git a/inference/engine/include/cpu/splice_cpu.hpp b/inference/engine/include/cpu/splice_cpu.hpp index e05b858f..bbb87424 100644 --- a/inference/engine/include/cpu/splice_cpu.hpp +++ b/inference/engine/include/cpu/splice_cpu.hpp @@ -44,8 +44,8 @@ class SpliceCPU : public Splice { this->transform_filter(); } EmbedParamSpec embedParamSpec; - embedParamSpec.input_dim = this->inputFrameSize; - embedParamSpec.num_output = inputDesc.dims[0]; + embedParamSpec.num_inputs = this->inputFrameSize; + embedParamSpec.num_outputs = inputDesc.dims[0]; embedParamSpec.transpose = false; CHECK_STATUS(embedding(this->weightTensors[0], inputTensor, embedParamSpec, this->temp, outputTensor, &this->archInfo)); diff --git a/inference/engine/include/cpu/tdnn_convolution_cpu.hpp b/inference/engine/include/cpu/tdnn_convolution_cpu.hpp index a83167ff..196fcd2c 100644 --- a/inference/engine/include/cpu/tdnn_convolution_cpu.hpp +++ b/inference/engine/include/cpu/tdnn_convolution_cpu.hpp @@ -35,9 +35,9 @@ class TdnnConvolutionCPU : public ConvolutionCPU { UNI_ERROR_LOG("TdnnCPU currently not support time context is decreasing order\n"); } } - ConvolutionMode convMode = Convolution_Pointwise; + ConvolutionMode convMode = CONVOLUTION_POINTWISE; if (dilation > 1) { - convMode = Convolution_Dilation; + convMode = CONVOLUTION_DILATION; } this->p = createConvolutionParamSpec(1, 1, this->tdnn.num_context, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, dilation, 1, this->tdnn.num_outputs, convMode); diff --git a/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp b/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp index 85c11226..1eddb1c1 100644 --- a/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp +++ b/inference/engine/include/cpu/tdnn_fully_connected_cpu.hpp @@ -80,7 +80,7 @@ class TdnnFullyConnectedCPU : public FullyConnectedCPU { j < this->outputFrameSize - this->slide_size; j++) { U8 *dst = output + (i * this->outputFrameSize + j) * tileSize; U8 *src = dst + tileSize; - memcpy(dst, src, tileSize); + UNI_MEMCPY(dst, src, tileSize); } } } @@ -104,8 +104,8 @@ class TdnnFullyConnectedCPU : public FullyConnectedCPU { ((CpuMemory *)spliceResult.get_memory())->set_shared_ptr(spliceBuffer); EmbedParamSpec embedParamSpec; - embedParamSpec.input_dim = this->inputFrameSize; - embedParamSpec.num_output = inputDesc.dims[0]; + embedParamSpec.num_inputs = this->inputFrameSize; + embedParamSpec.num_outputs = inputDesc.dims[0]; embedParamSpec.transpose = false; CHECK_STATUS(embedding(this->index, inputTensor, embedParamSpec, this->temp, spliceResult, &this->archInfo)); diff --git a/inference/engine/include/cpu/topk_cpu.hpp b/inference/engine/include/cpu/topk_cpu.hpp index 99d0ce3e..936651bd 100644 --- a/inference/engine/include/cpu/topk_cpu.hpp +++ 
b/inference/engine/include/cpu/topk_cpu.hpp @@ -28,19 +28,28 @@ class TopKCPU : public TopK { return mem; } + TopKParamSpec get_param(TensorDesc desc) + { + TopKParamSpec lp = this->p; + if (lp.k == 0) { + lp.k = desc.dims[desc.nDims]; + } + return lp; + } void run() override { - Tensor inputTensor = this->inputTensors[0]; - Tensor outputTensor = this->outputTensors[0]; - Tensor outputIndicesTensor = this->outputTensors[1]; - CHECK_STATUS(topk( - inputTensor, this->p, this->temp, outputTensor, outputIndicesTensor, &this->archInfo)); + CHECK_STATUS(topk(inputTensors[0], this->p, this->temp, outputTensors[0], outputTensors[1], + &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(topk_infer_output_size( - inTensors[0], this->p, outTensors[0], outTensors[1], &this->archInfo)); + TopKParamSpec lp = this->p; + if (lp.k == 0 && inTensors.size() > 1) { + lp = get_param(inTensors[1]->get_desc()); + } + CHECK_STATUS( + topk_infer_output_size(inTensors[0], lp, outTensors[0], outTensors[1], &this->archInfo)); return SUCCESS; } diff --git a/inference/engine/include/cpu/where_cpu.hpp b/inference/engine/include/cpu/where_cpu.hpp index 86d63c67..bba916ef 100644 --- a/inference/engine/include/cpu/where_cpu.hpp +++ b/inference/engine/include/cpu/where_cpu.hpp @@ -14,7 +14,6 @@ #ifndef _WHERE_CPU_H #define _WHERE_CPU_H -#include #include "where.hpp" class WhereCPU : public Where { @@ -31,31 +30,14 @@ class WhereCPU : public Where { void run() override { - CHECK_STATUS(where(this->inputTensors[1], this->inputTensors[0], this->biasTensors[0], + CHECK_STATUS(where(this->inputTensors[0], this->inputTensors[1], this->inputTensors[2], this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - //inTensors[0] is condition now, 2021/2/3 - CHECK_STATUS(where_infer_output_size( - inTensors[inTensors.size() - 1], outTensors[0], &this->archInfo)); - return SUCCESS; - } - - EE infer_weight_desc() override - { - auto curOpWs = this->get_weightspec(); - int weightBytes = curOpWs.bytes_of_weight; - int Lw = sqrt(weightBytes / bytesOf(curOpWs.mdt)); - int biasBytes = curOpWs.bytes_of_vec; - int Lb = biasBytes / bytesOf(curOpWs.mdt); - this->weightTensors = std::vector(1); - this->weightTensors[0].resize(tensor4d(this->dt, 1, 1, Lw, Lw)); - this->biasTensors = std::vector(1); - this->biasTensors[0].resize(tensor2d(this->dt, 1, Lb)); - return SUCCESS; + return where_infer_output_size(inTensors[1], inTensors[2], outTensors[0], &this->archInfo); } }; diff --git a/inference/engine/include/cumsum.hpp b/inference/engine/include/cumsum.hpp new file mode 100644 index 00000000..dc91a99e --- /dev/null +++ b/inference/engine/include/cumsum.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _CUMSUM_H +#define _CUMSUM_H + +#include "operator.hpp" + +class CumSum : public Operator { +public: + explicit CumSum(DataType dt, CumSumParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_CumSum; + } + +protected: + CumSumParamSpec p; +}; +#endif // _CUMSUM_H diff --git a/inference/engine/include/data_loader.hpp b/inference/engine/include/data_loader.hpp index bd88c9ca..7dadacee 100644 --- a/inference/engine/include/data_loader.hpp +++ b/inference/engine/include/data_loader.hpp @@ -20,6 +20,8 @@ int string_end_with(std::string s, std::string sub); +bool is_directory(std::string path); + void get_files(std::string directoryName, std::vector &files); std::vector load_fake_data(std::vector dataDesc); diff --git a/inference/engine/include/detection_output.hpp b/inference/engine/include/detection_output.hpp index cdc92799..ba52f8a6 100644 --- a/inference/engine/include/detection_output.hpp +++ b/inference/engine/include/detection_output.hpp @@ -46,9 +46,7 @@ class DetectionOutput : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS( - detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return detectionoutput_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/include/eltwise.hpp b/inference/engine/include/eltwise.hpp index 35a13270..85d14b9e 100644 --- a/inference/engine/include/eltwise.hpp +++ b/inference/engine/include/eltwise.hpp @@ -18,9 +18,9 @@ class Eltwise : public Operator { public: - Eltwise(EltwiseParamSpec eltwiseDesc) + Eltwise(EltwiseParamSpec p) { - this->eltwiseDesc = eltwiseDesc; + this->p = p; } OperatorType get_type() override @@ -37,6 +37,6 @@ class Eltwise : public Operator { } protected: - EltwiseParamSpec eltwiseDesc; + EltwiseParamSpec p; }; #endif // _ELTWISE_H diff --git a/inference/engine/include/equal.hpp b/inference/engine/include/equal.hpp index 3fbcd179..d1e8c243 100644 --- a/inference/engine/include/equal.hpp +++ b/inference/engine/include/equal.hpp @@ -29,11 +29,6 @@ class Equal : public WeightOperator { return OT_Equal; } - bool can_input_output_the_same() override - { - return false; - } - protected: EqualParamSpec p; }; diff --git a/inference/engine/include/factory.hpp b/inference/engine/include/factory.hpp index 085b43c9..8cc0c6ae 100644 --- a/inference/engine/include/factory.hpp +++ b/inference/engine/include/factory.hpp @@ -15,11 +15,10 @@ #define _FACTORY_H #include "operator.hpp" -#include "tensor_computing.h" #define NOT_SUPPORT \ Operator *cep = NULL; \ - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to create operator in %s.\n", __FUNCTION__); #define NOT_USE0() #define NOT_USE1(a1) \ { \ @@ -100,11 +99,12 @@ class Factory { virtual std::shared_ptr createMatMul(DataType dt, MatMulParamSpec p) = 0; - virtual std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) = 0; + virtual std::shared_ptr createLayerNorm( + 
DataType dt, LayerNormParamSpec p, U32 weightNum) = 0; virtual std::shared_ptr createReshape(DataType dt, ReshapeParamSpec p) = 0; - virtual std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) = 0; + virtual std::shared_ptr createResize(DataType dt, ResizeParamSpec p) = 0; virtual std::shared_ptr createSlice(DataType dt, SliceParamSpec p) = 0; @@ -131,7 +131,7 @@ class Factory { virtual std::shared_ptr createBilateralSliceApply(BilateralSliceApplyParamSpec p) = 0; - virtual std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) = 0; + virtual std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) = 0; virtual std::shared_ptr createSharedWeight(DataType dt, TensorDesc desc, @@ -186,8 +186,6 @@ class Factory { virtual std::shared_ptr createCast(DataType dt, CastParamSpec p) = 0; - virtual std::shared_ptr createEqual(DataType dt, EqualParamSpec p) = 0; - virtual std::shared_ptr createExpand(DataType dt, ExpandParamSpec p) = 0; virtual std::shared_ptr createScatter(DataType dt, ScatterParamSpec p) = 0; @@ -198,23 +196,31 @@ class Factory { virtual std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) = 0; - virtual std::shared_ptr createRoIAlign(RoIAlignParamSpec p) = 0; + virtual std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) = 0; virtual std::shared_ptr createGenerateProposals( DataType dt, GenerateProposalsParamSpec p) = 0; virtual std::shared_ptr createGAT(DataType dt, GATParamSpec p) = 0; - DataType get_float_precision(DataType dt) - { - DataType ret = dt; - if (dt == DT_F16_8Q) { - ret = DT_F16; - } else if (dt == DT_F32_8Q) { - ret = DT_F32; - } - return ret; - } + virtual std::shared_ptr createQuantizeLinear( + DataType dt, QuantizeLinearParamSpec p) = 0; + + virtual std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) = 0; + + virtual std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) = 0; + + virtual std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) = 0; + + virtual std::shared_ptr createNonMaxSuppression( + DataType dt, NonMaxSuppressionParamSpec p) = 0; + + virtual std::shared_ptr createConstantOfShape( + DataType dt, ConstantOfShapeParamSpec p) = 0; + + virtual std::shared_ptr createNonZero(DataType dt) = 0; + + virtual std::shared_ptr createRange(DataType dt, RangeParamSpec p) = 0; std::shared_ptr createOperators(OperatorSpec curOps, DataType dt, @@ -239,14 +245,28 @@ class Factory { if (dt == DT_F32_8Q || dt == DT_F16_8Q) { #ifndef _USE_INT8 UNI_ERROR_LOG("this library not support to inference int8, please recompile with " - "--int8=on. Only Armv7+ and x86 AVX512-VNNI cpu support.\n"); + "--int8=on. Only Armv7+ and x86 AVX512/AVX512-VNNI cpu support.\n"); #endif } OperatorType opType = curOps.type; - DataType dtNoQ = get_float_precision(dt); + DataType dtNoQ = (dt == DT_F16_8Q) ? DT_F16 : ((dt == DT_F32_8Q) ? 
DT_F32 : dt); std::string opName = curOps.name; std::shared_ptr op; auto curPs = curOps.ps; + std::map activationMap = {{OT_Relu6, ACTIVATION_RELU6}, + {OT_HSwish, ACTIVATION_H_SWISH}, {OT_HSwishNoDiv, ACTIVATION_H_SWISH_NODIV}, + {OT_Sigmoid, ACTIVATION_SIGMOID}, {OT_HSigmoid, ACTIVATION_H_SIGMOID}, + {OT_Gelu, ACTIVATION_GELU}, {OT_TanH, ACTIVATION_TANH}, {OT_Mish, ACTIVATION_MISH}, + {OT_Greater, ACTIVATION_GREATER}, {OT_Exp, ACTIVATION_EXP}, + {OT_SoftPlus, ACTIVATION_SOFTPLUS}, {OT_Abs, ACTIVATION_ABS}, {OT_Sign, ACTIVATION_SIGN}, + {OT_Not, ACTIVATION_NOT}, {OT_Log, ACTIVATION_LOG}, {OT_Neg, ACTIVATION_NEG}, + {OT_Round, ACTIVATION_ROUND}, {OT_Floor, ACTIVATION_FLOOR}, {OT_Ceil, ACTIVATION_CEIL}, + {OT_Swish, ACTIVATION_SWISH}, {OT_Reciprocal, ACTIVATION_RECIPROCAL}}; + if (activationMap.find(opType) != activationMap.end()) { + ActivationParamSpec activationDesc; + activationDesc.mode = activationMap[opType]; + return createActivation(activationDesc); + } switch (opType) { case OT_Conv: { ActivationParamSpec dwActiveDesc; @@ -288,60 +308,6 @@ class Factory { op = createActivation(activationDesc); break; } - case OT_Relu6: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_RELU6; - op = createActivation(activationDesc); - break; - } - case OT_HSwish: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH; - op = createActivation(activationDesc); - break; - } - case OT_HSwishNoDiv: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SWISH_NODIV; - op = createActivation(activationDesc); - break; - } - case OT_Sigmoid: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_HSigmoid: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_H_SIGMOID; - op = createActivation(activationDesc); - break; - } - case OT_Gelu: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_GELU; - op = createActivation(activationDesc); - break; - } - case OT_TanH: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_TANH; - op = createActivation(activationDesc); - break; - } - case OT_Mish: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_MISH; - op = createActivation(activationDesc); - break; - } - case OT_Greater: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_GREATER; - op = createActivation(activationDesc); - break; - } case OT_Concat: { op = createConcat(curPs.concat_spec); break; @@ -367,7 +333,7 @@ class Factory { break; } case OT_LayerNorm: { - op = createLayerNorm(dt, 0); + op = createLayerNorm(dt, curPs.ln_spec, 0); break; } case OT_Reshape: { @@ -375,12 +341,7 @@ class Factory { break; } case OT_Resize: { - if (curPs.resize_spec.num_sizes > 0) { - op = createResize(DT_U32, curPs.resize_spec); - } else { - CHECK_REQUIREMENT(curPs.resize_spec.num_scales == 4); - op = createResize(DT_F32, curPs.resize_spec); - } + op = createResize(dt, curPs.resize_spec); break; } case OT_Slice: { @@ -424,10 +385,7 @@ class Factory { break; } case OT_PreAllocatedMemory: { - PreAllocatedMemoryParamSpec curPreAllocatedMemoryParamSpec = - curOps.ps.preallocated_memory_spec; - TensorDesc desc = curPreAllocatedMemoryParamSpec.desc; - op = createPreAllocatedMemory(dtNoQ, desc); + op = createPreAllocatedMemory(curOps.ps.preallocated_memory_spec); break; } case OT_SharedWeight: { @@ -527,18 +485,6 @@ class Factory { op = createWhere(dt); 
break; } - case OT_SoftPlus: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SOFTPLUS; - op = createActivation(activationDesc); - break; - } - case OT_Exp: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_EXP; - op = createActivation(activationDesc); - break; - } case OT_Tdnn: { op = createTdnn(dt, curPs.tdnn_spec); break; @@ -551,26 +497,10 @@ class Factory { op = createTopK(dt, curPs.topk_spec); break; } - case OT_Abs: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_ABS; - op = createActivation(activationDesc); - break; - } case OT_Cast: { op = createCast(dt, curPs.cast_spec); break; } - case OT_Equal: { - op = createEqual(dt, curPs.equal_spec); - break; - } - case OT_Sign: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_SIGN; - op = createActivation(activationDesc); - break; - } case OT_InstanceNorm: { op = createInstanceNorm(dt, curPs.in_spec); break; @@ -591,38 +521,52 @@ class Factory { op = createSelect(dt); break; } - case OT_Not: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_NOT; - op = createActivation(activationDesc); + case OT_GAT: { + op = createGAT(dt, curPs.gat_spec); break; } - case OT_Log: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_LOG; - op = createActivation(activationDesc); + case OT_RoIAlign: { + op = createRoIAlign(dt, curPs.roialign_spec); break; } - case OT_Neg: { - ActivationParamSpec activationDesc; - activationDesc.mode = ACTIVATION_NEG; - op = createActivation(activationDesc); + case OT_GenerateProposals: { + op = createGenerateProposals(dt, curPs.generate_proposals_spec); break; } - case OT_GAT: { - op = createGAT(dt, curPs.gat_spec); + case OT_QuantizeLinear: { + op = createQuantizeLinear(dt, curPs.quant_spec); break; } - case OT_RoIAlign: { - op = createRoIAlign(curPs.roialign_spec); + case OT_GridSample: { + op = createGridSample(dt, curPs.grid_sample_spec); break; } - case OT_GenerateProposals: { - op = createGenerateProposals(dt, curPs.generate_proposals_spec); + case OT_OneHot: { + op = createOneHot(dt, curPs.onehot_spec); + break; + } + case OT_CumSum: { + op = createCumSum(dt, curPs.cumsum_spec); + break; + } + case OT_NonMaxSuppression: { + op = createNonMaxSuppression(dt, curPs.non_max_suppression_spec); + break; + } + case OT_ConstantOfShape: { + op = createConstantOfShape(dt, curPs.constant_of_shape_spec); + break; + } + case OT_NonZero: { + op = createNonZero(dt); + break; + } + case OT_Range: { + op = createRange(dt, curPs.range_spec); break; } default: { - UNI_ERROR_LOG("unsupported layer %s\n", OperatorTypeName()[opType]); + UNI_ERROR_LOG("can not create layer %s.\n", OperatorTypeName()[opType]); break; } } diff --git a/inference/engine/include/generate_proposals.hpp b/inference/engine/include/generate_proposals.hpp index d55fab89..f63c9d24 100644 --- a/inference/engine/include/generate_proposals.hpp +++ b/inference/engine/include/generate_proposals.hpp @@ -34,9 +34,7 @@ class GenerateProposals : public WeightOperator { bool findId = false; this->anchorBlockDim = 4; U32 tensorNum = inTensors.size(); - if (tensorNum != 3) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(tensorNum == 3); for (U32 i = 0; i < tensorNum; i++) { U32 j = (i + 1) % tensorNum; TensorDesc iDesc = inTensors[i]->get_desc(); @@ -60,13 +58,10 @@ class GenerateProposals : public WeightOperator { } } } - if (!findId) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(findId); } protected: - DataType dt; 
GenerateProposalsParamSpec p; U8 deltaTensorId; U8 logitTensorId; diff --git a/inference/engine/include/grid_sample.hpp b/inference/engine/include/grid_sample.hpp new file mode 100644 index 00000000..ac5193bb --- /dev/null +++ b/inference/engine/include/grid_sample.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _GRID_SAMPLE_H +#define _GRID_SAMPLE_H + +#include "operator.hpp" + +class GridSample : public Operator { +public: + GridSample(DataType dt, GridSampleParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_GridSample; + } + +protected: + GridSampleParamSpec p; +}; + +#endif // _GRID_SAMPLE_H diff --git a/inference/engine/include/image_container.hpp b/inference/engine/include/image_container.hpp index 9792b658..6288a93a 100644 --- a/inference/engine/include/image_container.hpp +++ b/inference/engine/include/image_container.hpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -#ifndef _IMAGE_CONTAINER_ -#define _IMAGE_CONTAINER_ +#ifndef _IMAGE_CONTAINER_H +#define _IMAGE_CONTAINER_H #include "tensor_desc.h" #include "image_manager.hpp" @@ -49,7 +49,7 @@ class ImageContainer : public ImageManager { { I32 vecId = ImageManager::getImageVecsId(slot, width, height, depth); if (vecId < 0 || vecId >= (I32)images[slot].size()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image buffer reuse wrong.\n"); } return *(images[slot][vecId].get()); } diff --git a/inference/engine/include/image_manager.hpp b/inference/engine/include/image_manager.hpp index 5880d5de..ece144c9 100644 --- a/inference/engine/include/image_manager.hpp +++ b/inference/engine/include/image_manager.hpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#ifndef _IMAGE_MANAGER_ -#define _IMAGE_MANAGER_ +#ifndef _IMAGE_MANAGER_H +#define _IMAGE_MANAGER_H #include class ImageManager { @@ -32,7 +32,7 @@ class ImageManager { if (width == 0 && height == 0 && depth == 0) { return false; } else if (width == 0 || height == 0 || depth == 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image tensor parameter is wrong.\n"); } if (imageVecs.count(slot) == 0) { std::vector> strs(1, str); diff --git a/inference/engine/include/inference.hpp b/inference/engine/include/inference.hpp index 5f9cfbe9..0db6b1cf 100644 --- a/inference/engine/include/inference.hpp +++ b/inference/engine/include/inference.hpp @@ -18,7 +18,6 @@ #ifdef _USE_GPU #include "gcl.h" #endif -#include "thread_affinity.h" inline std::map extractInputDims(const ModelSpec *ms) { @@ -41,10 +40,9 @@ inline std::shared_ptr createPipelinefromMs( // create ops cnn->initialize_ops(ms); - std::map inputDescMap = extractInputDims(ms); - cnn->loadAlgorithmMap(algorithmMapPath); + std::map inputDescMap = extractInputDims(ms); // assign space for output, tmp, bias, and trans_weight cnn->ready(inputDescMap); @@ -56,12 +54,13 @@ inline std::shared_ptr createPipelinefromMs( inline std::shared_ptr createPipeline( const char *affinityPolicyName, const char *modelPath, const char *algorithmMapPath = "") { - // deserialize model from file + std::shared_ptr pipeline; ModelSpec ms; - CHECK_STATUS(deserialize_model_from_file(modelPath, &ms)); - std::shared_ptr pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath); - CHECK_STATUS(mt_destroy_model(&ms)); + EE ret = deserialize_model_from_file(modelPath, &ms); + if (ret == SUCCESS) { + pipeline = createPipelinefromMs(affinityPolicyName, &ms, algorithmMapPath); + CHECK_STATUS(mt_destroy_model(&ms)); + } return pipeline; } - #endif diff --git a/inference/engine/include/instance_norm.hpp b/inference/engine/include/instance_norm.hpp index fda06c9f..76b1ad11 100644 --- a/inference/engine/include/instance_norm.hpp +++ b/inference/engine/include/instance_norm.hpp @@ -22,7 +22,6 @@ class InstanceNorm : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = 0; } OperatorType get_type() override @@ -32,7 +31,6 @@ class InstanceNorm : public WeightOperator { protected: InstanceNormParamSpec p; - U32 numChannels; }; #endif // _INSTANCE_NORM_H diff --git a/inference/engine/include/jump.hpp b/inference/engine/include/jump.hpp index 2932217b..102396d3 100644 --- a/inference/engine/include/jump.hpp +++ b/inference/engine/include/jump.hpp @@ -46,7 +46,7 @@ class Jump : public Operator { // check status if (this->inputTensors.size() > 1) { Tensor inputTensor = this->inputTensors[1]; - I32 *ptr = (I32 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); + U8 *ptr = (U8 *)((CpuMemory *)(inputTensor.get_memory()))->get_ptr(); U32 length = inputTensor.length(); for (U32 i = 0; i < length; i++) { if (ptr[i]) { diff --git a/inference/engine/include/layer_norm.hpp b/inference/engine/include/layer_norm.hpp index f9e27ac0..4b39599e 100644 --- a/inference/engine/include/layer_norm.hpp +++ b/inference/engine/include/layer_norm.hpp @@ -18,9 +18,10 @@ class LayerNorm : public WeightOperator { public: - LayerNorm(DataType dt, U32 weightNum) + LayerNorm(DataType dt, LayerNormParamSpec p, U32 weightNum) { this->dt = dt; + this->p = p; this->weightNum = weightNum; this->hasBias = false; } @@ -31,6 +32,7 @@ class LayerNorm : public WeightOperator { } protected: + LayerNormParamSpec p; U32 weightNum; }; diff --git 
a/inference/engine/include/memory_tracker.hpp b/inference/engine/include/memory_tracker.hpp index adc0a790..5110846e 100644 --- a/inference/engine/include/memory_tracker.hpp +++ b/inference/engine/include/memory_tracker.hpp @@ -96,7 +96,7 @@ class MemoryTracker { { I32 subSlot = imageManager.getImageVecsId(slot, str[0], str[1], str[2]); if (subSlot < 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu image buffer reuse parameter is wrong.\n"); } return subSlot; } diff --git a/inference/engine/include/model.hpp b/inference/engine/include/model.hpp index f9fa654c..e17aa577 100644 --- a/inference/engine/include/model.hpp +++ b/inference/engine/include/model.hpp @@ -15,9 +15,8 @@ #define _MODEL_H #include "operator.hpp" -#include "tensor_desc.h" #include "algorithm_map.h" -#include "thread_affinity.h" +#include "affinity_policy.h" #ifdef _USE_GPU #include "gcl.h" #endif @@ -27,153 +26,40 @@ class Model { Model() {} - Model(AffinityPolicy affinityPolicy, DataType dt, std::string name) - { - this->set_device_info(affinityPolicy); - this->dt = dt; - this->name = name; - std::string deviceName = ""; - if (IS_GPU(this->deviceInfo.schedule)) { -#ifdef _USE_GPU - if (OCLContext::getInstance().handle->useQualcommDev) { - this->deviceInfo.schedule = QUALCOMM; - } -#else - UNI_ERROR_LOG("This library not support ARM MALI/Qualcomm GPU, please rebuild library " - "with --gpu option.\n"); - exit(1); -#endif - } - algorithmMap = std::shared_ptr( - new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt)); - } - - void set_runtime_device(int cpuId, int threadId = 0) - { - this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId); - } - - void set_runtime_device(int cpuId, Arch arch, int threadId = 0) - { - this->deviceInfo.schedule = arch; - UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) - if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) { - set_thread_affinity(threadId, &cpuId, 1); - for (auto op : ops) { - op->set_schedule(this->deviceInfo.schedule); - } - } - } - - void set_runtime_device_dynamic(int threadId = 0) - { - set_cpu_dynamic(&this->deviceInfo, threadId); - } - - Arch get_runtime_device() - { - return this->deviceInfo.schedule; - } - - virtual void ready(std::map inputDescMap) - { - infer_output_tensors_size(inputDescMap); - assign_output_tensor(); - - infer_tmp_memory_size(); - assign_tmp_tensor(); - } + explicit Model(AffinityPolicy affinityPolicy, DataType dt, std::string name); + + virtual ~Model() = default; + + virtual void ready(std::map inputDescMap); virtual void run() = 0; #ifdef _USE_INT8 - virtual U32 find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx) - { - CHECK_REQUIREMENT(startIdx < this->ops.size()) - for (U32 i = startIdx; i < this->ops.size();) { - auto op = this->ops[i]; - if (op->is_dynamic_scale()) { - bool calibrated = false; - for (auto idx : calibratedOpIdx) { - if (i == idx) { - calibrated = true; - break; - } - } - if (!calibrated) { - return i; - } - } - - if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { - i = op->get_next_operator_index(); - } else { - i++; - } - } - - return 0; // The first layer should never be quantized - } - - virtual std::shared_ptr get_operator_by_index(U32 index) - { - return this->ops[index]; - } - - virtual void run_till_breakpoint(U32 opIdx) - { - CHECK_REQUIREMENT(IS_CPU(this->deviceInfo.schedule)); - for (U32 i = 0; i < this->ops.size();) { - auto op = this->ops[i]; - if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { - if (opIdx == i) { 
- break; - } - i = op->get_next_operator_index(); - } else { - op->run(); - if (opIdx == i) { - break; - } - i++; - } - } - } -#endif + virtual U32 find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx); - std::string get_name() - { - return this->name; - } + virtual std::shared_ptr get_operator_by_index(U32 index); - void loadAlgorithmMap(CI8 *path, bool useFileStream = false) - { - std::string algoName = this->algorithmMap->getAlgorithmFileName(); - CI8 *algoInfo = nullptr; - if (IS_GPU(this->deviceInfo.schedule)) { -#ifdef _USE_GPU - algoInfo = gcl_get_algorithm_info(OCLContext::getInstance().handle.get(), algoName); + virtual void run_till_breakpoint(U32 opIdx); #endif - } - if (!algoInfo && useFileStream) { - algoInfo = path; - } - if (algoInfo) { - this->algorithmMap->loadAlgorithmMapFromFileStream(algoInfo); - } else if (path) { - this->algorithmMap->loadAlgorithmMapFromFile(path); - } - } - - void saveAlgorithmMapToFile(std::string algorithmMapPath) - { - this->algorithmMap->saveAlgorithmMapToFile(algorithmMapPath); - } + + void loadAlgorithmMap(CI8 *path, bool useFileStream = false); + + void saveAlgorithmMapToFile(std::string algorithmMapPath); + + void set_runtime_device(int cpuId, int threadId = 0); + + void set_runtime_device(int cpuId, Arch arch, int threadId = 0); + + void set_runtime_device_dynamic(int threadId = 0); + + Arch get_runtime_device(); + + std::string get_name(); protected: + DataType dt; std::vector> ops; DeviceInfo deviceInfo; - DataType dt; std::shared_ptr algorithmMap; virtual EE infer_output_tensors_size(std::map) = 0; @@ -181,29 +67,9 @@ class Model { virtual void infer_tmp_memory_size() = 0; virtual void assign_tmp_tensor() = 0; - virtual bool checkOperator() - { - for (auto op : this->ops) { - if (!op->checkOperator()) { - return false; - } - } - return true; - } - private: std::string name; - void set_device_info(AffinityPolicy affinityPolicy) - { -#ifndef _USE_IOS - this->deviceInfo = get_cpu_info(affinityPolicy); - this->set_runtime_device_dynamic(); -#else - this->deviceInfo.affinityPolicy = affinityPolicy; - this->deviceInfo.schedule = ARM_A76; -#endif - UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) - } + void set_device_info(AffinityPolicy affinityPolicy); }; #endif diff --git a/inference/engine/include/non_max_suppression.hpp b/inference/engine/include/non_max_suppression.hpp new file mode 100644 index 00000000..c3e2a65f --- /dev/null +++ b/inference/engine/include/non_max_suppression.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _NON_MAX_SUPPRESSION_H +#define _NON_MAX_SUPPRESSION_H + +#include "operator.hpp" + +class NonMaxSuppression : public Operator { +public: + explicit NonMaxSuppression(DataType dt, NonMaxSuppressionParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_NonMaxSuppression; + } + +protected: + NonMaxSuppressionParamSpec p; +}; +#endif // _NON_MAX_SUPPRESSION_H diff --git a/inference/engine/include/non_zero.hpp b/inference/engine/include/non_zero.hpp new file mode 100644 index 00000000..fb3b0865 --- /dev/null +++ b/inference/engine/include/non_zero.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _NON_ZERO_H +#define _NON_ZERO_H + +#include "operator.hpp" + +class NonZero : public Operator { +public: + explicit NonZero(DataType dt) + { + this->dt = dt; + } + + OperatorType get_type() override + { + return OT_NonZero; + } +}; +#endif // _NON_ZERO_H diff --git a/inference/engine/include/ocl/activation_ocl.hpp b/inference/engine/include/ocl/activation_ocl.hpp index aa492719..ac6767a1 100644 --- a/inference/engine/include/ocl/activation_ocl.hpp +++ b/inference/engine/include/ocl/activation_ocl.hpp @@ -18,7 +18,7 @@ class ActivationOCL : public Activation { public: - ActivationOCL(ActivationParamSpec activationDesc) : Activation(activationDesc) + ActivationOCL(ActivationParamSpec p) : Activation(p) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class ActivationOCL : public Activation { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ActivationOCL(this->activationDesc)); + std::shared_ptr(new ActivationOCL(this->p)); *mem = *this; return mem; } @@ -38,7 +38,7 @@ class ActivationOCL : public Activation { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(activation(inputTensor, this->activationDesc, outputTensor, &this->archInfo)); + CHECK_STATUS(activation(inputTensor, this->p, outputTensor, &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp index 4218cc9b..a1d7712c 100644 --- a/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp +++ b/inference/engine/include/ocl/bilateral_slice_apply_ocl.hpp @@ -41,7 +41,7 @@ class BilateralSliceApplyOCL : public BilateralSliceApply { Tensor gridTensor = this->inputTensors[1]; Tensor outputTensor = this->outputTensors[0]; - if (this->p.mode == BSliceApply_NULL) { + if (this->p.mode == BSLICE_APPLY_NULL) { this->guideTensor = this->inputTensors[2]; } CHECK_STATUS(bilateral_slice_apply( diff --git a/inference/engine/include/ocl/cast_ocl.hpp b/inference/engine/include/ocl/cast_ocl.hpp index 2baa1d54..4d5d5aea 100644 --- a/inference/engine/include/ocl/cast_ocl.hpp +++ b/inference/engine/include/ocl/cast_ocl.hpp @@ -35,16 +35,14 @@ class CastOCL : public Cast { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - Tensor inputTensor = this->inputTensors[0]; - Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(cast(inputTensor, outputTensor, this->p, &this->archInfo)); + CHECK_STATUS(cast(this->inputTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - CHECK_STATUS(cast_infer_output_size(inTensors[0], outTensors[0], this->p, &this->archInfo)); + CHECK_STATUS(cast_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); return SUCCESS; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/concat_ocl.hpp b/inference/engine/include/ocl/concat_ocl.hpp index d52615c3..a864e0be 100644 --- a/inference/engine/include/ocl/concat_ocl.hpp +++ b/inference/engine/include/ocl/concat_ocl.hpp @@ -73,7 +73,8 @@ class ConcatOCL : public Concat { U32 infer_tmp_memory_size() override { U32 bytes = 0; - CHECK_STATUS(concat_infer_forward_tmp_bytes(this->inputTensors, &bytes, &this->archInfo)); + CHECK_STATUS(concat_infer_forward_tmp_bytes( + this->inputTensors, 
this->outputTensors[0], &bytes, &this->archInfo)); return bytes; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/convolution_ocl.hpp b/inference/engine/include/ocl/convolution_ocl.hpp index 9765c7b7..7e702ba9 100644 --- a/inference/engine/include/ocl/convolution_ocl.hpp +++ b/inference/engine/include/ocl/convolution_ocl.hpp @@ -47,7 +47,7 @@ class ConvolutionOCL : public Convolution { U32 filterNum = 1; DataType dtNoQ = (this->dt == DT_F16_8Q) ? DT_F16 : this->dt; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (this->p.num_outputs_origin == 1) { if (tensorIs5d(wDesc[0])) { wDesc[0].dims[4] = this->p.num_outputs; @@ -61,13 +61,13 @@ class ConvolutionOCL : public Convolution { CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { vDesc[0] = tensor1d(dtNoQ, this->p.num_outputs); ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { wDesc[1] = this->filterDescExt; vDesc[0] = tensor1d(dtNoQ, this->numChannels); vDesc[1] = tensor1d(dtNoQ, this->p.num_outputs); @@ -76,13 +76,8 @@ class ConvolutionOCL : public Convolution { DEPTHWISE_CONVOLUTION_ALGORITHM_NULL; break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - return NOT_SUPPORTED; - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read new type convolution's weight.\n"); return NOT_SUPPORTED; } @@ -106,7 +101,7 @@ class ConvolutionOCL : public Convolution { Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { Tensor tmpTensor = Tensor(OCLMem); std::vector tmpTensors(3, tmpTensor); tmpTensors[0] = this->temp; @@ -121,15 +116,15 @@ class ConvolutionOCL : public Convolution { &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { Tensor tmpTensor = this->temp; get_tmp_image(0, bytes + 1, &tmpTensor); - CHECK_STATUS( - depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, biasTensor, - tmpTensor, outputTensor, this->dwActivationParamSpec, &this->archInfo)); + CHECK_STATUS(depthwise_convolution(inputTensor, filterTensor, p, this->dwAlg, + nullptr, biasTensor, tmpTensor, outputTensor, this->dwActivationParamSpec, + &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { auto dwFilterTensor = filterTensor; auto pwFilterTensor = this->weightTensors[1]; auto dwBiasTensor = biasTensor; @@ -140,17 +135,13 @@ class ConvolutionOCL : public Convolution { get_tmp_image(0, bytes + 1, &tmpTensors[1]); get_tmp_image(1, bytes + 4, &tmpTensors[2]); CHECK_STATUS(depthwise_pointwise_convolution(this->inputTensors, dwFilterTensor, - pwFilterTensor, p, this->dwAlg, dwBiasTensor, pwBiasTensor, tmpTensors, + pwFilterTensor, p, this->dwAlg, nullptr, dwBiasTensor, pwBiasTensor, tmpTensors, outputTensor, this->dwActivationParamSpec, this->pwActivationParamSpec, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: { - UNI_ERROR_LOG("unsupported convolution type %d\n", this->p.convolution_type); + UNI_ERROR_LOG("not support to run new type convolution.\n"); } } } @@ -165,9 +156,10 @@ class ConvolutionOCL : public Convolution { ConvolutionPolicy policy = 
CONVOLUTION_TUNNING; DataType targetType = DT_F16; I32 algo[7]; - std::string name = this->name + std::to_string(get_type()) + std::to_string(this->p.convolution_type); + std::string name = + this->name + std::to_string(get_type()) + std::to_string(this->p.convolution_type); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (this->dt == DT_F16_8Q) { targetType = DT_I8; } @@ -190,7 +182,7 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { if (algorithmMap->getAlgorithmInfoFromMap(name, algo, 4)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -210,7 +202,7 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { if (algorithmMap->getAlgorithmInfoFromMap(name, algo, 7)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -239,12 +231,9 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's algorithm.\n"); + return NOT_SUPPORTED; } return SUCCESS; } @@ -293,11 +282,11 @@ class ConvolutionOCL : public Convolution { } DataType targetType = DT_F16; // Default DT_F16 - if (this->p.convolution_type == Convolution_Dilation) { - this->p.convolution_type = Convolution_Pointwise; + if (this->p.convolution_type == CONVOLUTION_DILATION) { + this->p.convolution_type = CONVOLUTION_POINTWISE; } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { if (tensorIs5d(inDim)) { this->filterDesc = tensor5df(this->dt, DF_NCHW, numFiltersOcl, this->numChannels, this->p.kernel_t, this->p.kernel_h, this->p.kernel_w); @@ -310,7 +299,7 @@ class ConvolutionOCL : public Convolution { inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { this->filterDesc = tensor4df( this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); filterTensor.resize(this->filterDesc); @@ -318,7 +307,7 @@ class ConvolutionOCL : public Convolution { inputTensor, filterTensor, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { this->filterDesc = tensor4df( this->dt, DF_NCHW, 1, this->numChannels, this->p.kernel_h, this->p.kernel_w); this->filterDescExt = @@ -330,12 +319,9 @@ class ConvolutionOCL : public Convolution { filterTensor, filterTensorExt, p, outTensors[0], targetType, &this->archInfo)); break; } - case Convolution_Dilation: { - return NOT_SUPPORTED; - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's output.\n"); + return NOT_SUPPORTED; } if (use_output_tensor_image(numFiltersOcl, inputTensor)) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); @@ -352,28 +338,24 @@ class ConvolutionOCL : public Convolution { bytes[i] = 0; } switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->pwAlg, bytes, &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { 
CHECK_STATUS(depthwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, outputTensor, p, this->dwAlg, bytes, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_infer_forward_tmp_bytes(inputTensor, filterTensor, this->weightTensors[1], outputTensor, p, this->dwAlg, bytes, &this->archInfo)); break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tmp memory.\n"); } add_tmp_image(0, bytes + 1); add_tmp_image(1, bytes + 4); @@ -387,22 +369,21 @@ class ConvolutionOCL : public Convolution { U32 biasNum = 0; TensorDesc desc[2]; switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter_bytes( filterTensor, this->p, this->pwAlg, desc, &this->archInfo)); - if (this->runInfo.best_k[0] <= 1 && - this->pwAlg == CONVOLUTION_ALGORITHM_DIRECT) { + if (this->runInfo.best_k[0] <= 1 && this->pwAlg == CONVOLUTION_ALGORITHM_DIRECT) { needTransBiasImgToBuf = true; biasNum = 0; } break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter_bytes( filterTensor, this->p, this->dwAlg, desc, &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter_bytes(filterTensor, this->weightTensors[1], this->p, this->dwAlg, &desc[0], &desc[1], &this->archInfo)); @@ -415,12 +396,10 @@ class ConvolutionOCL : public Convolution { } break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; - } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to infer new type convolution's tramsform filter tmp " + "memory.\n"); + return NOT_SUPPORTED; } this->wtm = std::shared_ptr(new Tensor(OCLMem)); this->wtm->resize(desc[0]); @@ -448,38 +427,36 @@ class ConvolutionOCL : public Convolution { EE transform_filter() override { auto filterTensor = this->weightTensors[0]; - if (DT_F16_8Q == this->dt && Convolution_Pointwise == this->p.convolution_type && + if (DT_F16_8Q == this->dt && CONVOLUTION_POINTWISE == this->p.convolution_type && CONVOLUTION_ALGORITHM_WINOGRAD == this->pwAlg) { // int8 winograd return NOT_SUPPORTED; } else if (DT_F16_8Q == this->dt && - Convolution_Pointwise == this->p.convolution_type) { // int8 tilegemm + CONVOLUTION_POINTWISE == this->p.convolution_type) { // int8 tilegemm return NOT_SUPPORTED; } else { // All other cases CHECK_STATUS(alloc_wtm_memory()); switch (this->p.convolution_type) { - case Convolution_Pointwise: { + case CONVOLUTION_POINTWISE: { CHECK_STATUS(convolution_transform_filter(filterTensor, this->p, this->pwAlg, this->temp, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise: { + case CONVOLUTION_DEPTHWISE: { CHECK_STATUS(depthwise_convolution_transform_filter( filterTensor, this->p, this->dwAlg, this->wtm.get(), &this->archInfo)); break; } - case Convolution_Depthwise_Pointwise: { + case CONVOLUTION_DEPTHWISE_POINTWISE: { CHECK_STATUS(depthwise_pointwise_convolution_transform_filter(filterTensor, this->weightTensors[1], this->p, this->dwAlg, this->wtm.get(), &this->wtm_dp, &this->archInfo)); this->weightTensors[1] = wtm_dp; break; } - case Convolution_Dilation: { - CHECK_STATUS(NOT_SUPPORTED); - break; + default: { + UNI_ERROR_LOG("not 
support to transform new type convolution's filter.\n"); + return NOT_SUPPORTED; } - default: - CHECK_STATUS(NOT_SUPPORTED); } } this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/ocl/copy_ocl.hpp b/inference/engine/include/ocl/copy_ocl.hpp index 7ac82768..c2c2dcea 100644 --- a/inference/engine/include/ocl/copy_ocl.hpp +++ b/inference/engine/include/ocl/copy_ocl.hpp @@ -38,9 +38,7 @@ class CopyOCL : public Copy { TensorDesc srcDesc = this->inputTensors[0].get_desc(); TensorDesc dstDesc = this->inputTensors[1].get_desc(); U32 batch = srcDesc.dims[srcDesc.nDims - 1]; - if (batch > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(batch == 1); U32 copyLength = (this->p.length >= 0) ? this->p.length : tensorNumElements(srcDesc) / batch; U32 srcStride = (this->p.src_dims[0] >= 0) ? this->p.src_dims[1] : tensorNumElements(srcDesc) / batch; diff --git a/inference/engine/include/ocl/eltwise_ocl.hpp b/inference/engine/include/ocl/eltwise_ocl.hpp index ca9be6ff..3233866c 100644 --- a/inference/engine/include/ocl/eltwise_ocl.hpp +++ b/inference/engine/include/ocl/eltwise_ocl.hpp @@ -18,7 +18,7 @@ class EltwiseOCL : public Eltwise { public: - EltwiseOCL(EltwiseParamSpec eltwiseDesc) : Eltwise(eltwiseDesc) + EltwiseOCL(EltwiseParamSpec p) : Eltwise(p) { INIT_GPU_INFO(nullptr) } @@ -27,8 +27,7 @@ class EltwiseOCL : public Eltwise { std::shared_ptr clone() override { - std::shared_ptr mem = - std::shared_ptr(new EltwiseOCL(this->eltwiseDesc)); + std::shared_ptr mem = std::shared_ptr(new EltwiseOCL(this->p)); *mem = *this; return mem; } @@ -36,8 +35,8 @@ class EltwiseOCL : public Eltwise { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(eltwise(this->inputTensors, this->eltwiseDesc, this->temp, - this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(eltwise( + this->inputTensors, this->p, this->temp, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/embedding_ocl.hpp b/inference/engine/include/ocl/embedding_ocl.hpp index a664c637..7e31960c 100644 --- a/inference/engine/include/ocl/embedding_ocl.hpp +++ b/inference/engine/include/ocl/embedding_ocl.hpp @@ -52,20 +52,16 @@ class EmbeddingOCL : public Embedding { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (this->p.num_output <= 0) { - if (inTensors.size() <= 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + if (this->p.num_outputs <= 0) { + CHECK_REQUIREMENT(inTensors.size() > 1); TensorDesc desc = inTensors[1]->get_desc(); - if (desc.nDims != 2) { - CHECK_STATUS(NOT_MATCH); - } + CHECK_REQUIREMENT(desc.nDims == 2); if (this->p.transpose) { - this->p.input_dim = desc.dims[0]; - this->p.num_output = desc.dims[1]; + this->p.num_inputs = desc.dims[0]; + this->p.num_outputs = desc.dims[1]; } else { - this->p.input_dim = desc.dims[1]; - this->p.num_output = desc.dims[0]; + this->p.num_inputs = desc.dims[1]; + this->p.num_outputs = desc.dims[0]; } } CHECK_STATUS(embedding_infer_output_size( @@ -81,9 +77,9 @@ class EmbeddingOCL : public Embedding { } TensorDesc weightDesc; if (this->p.transpose) { - weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_output, this->p.input_dim); + weightDesc = tensor2df(this->dt, DF_TRANSPOSE, this->p.num_outputs, this->p.num_inputs); } else { - weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.input_dim, this->p.num_output); + weightDesc = tensor2df(this->dt, DF_NORMAL, this->p.num_inputs, 
this->p.num_outputs); } Tensor modelWeightTensor = Tensor(OCLMem); modelWeightTensor.resize(weightDesc); diff --git a/inference/engine/include/ocl/factory_ocl.hpp b/inference/engine/include/ocl/factory_ocl.hpp index 7f401dc4..ea4ccebf 100644 --- a/inference/engine/include/ocl/factory_ocl.hpp +++ b/inference/engine/include/ocl/factory_ocl.hpp @@ -156,9 +156,10 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createLayerNorm(DataType dt, U32 weightNum) override + std::shared_ptr createLayerNorm( + DataType dt, LayerNormParamSpec p, U32 weightNum) override { - auto cep = (LayerNorm *)new LayerNormOCL(dt, weightNum); + auto cep = (LayerNorm *)new LayerNormOCL(dt, p, weightNum); return std::shared_ptr(cep); } @@ -168,11 +169,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createResize(DataType paramDT, ResizeParamSpec p) override + std::shared_ptr createResize(DataType dt, ResizeParamSpec p) override { - // auto cep = new Resize(paramDT, paramPtr); - // OP_UNSUP(2, paramDT, paramPtr); - auto cep = (Resize *)(new ResizeOCL(paramDT, p)); + auto cep = (Resize *)(new ResizeOCL(dt, p)); return std::shared_ptr(cep); } @@ -250,9 +249,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createPreAllocatedMemory(DataType dt, TensorDesc desc) override + std::shared_ptr createPreAllocatedMemory(PreAllocatedMemoryParamSpec p) override { - auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(dt, desc); + auto cep = (PreAllocatedMemory *)new PreAllocatedMemoryOCL(p); return std::shared_ptr(cep); } @@ -393,12 +392,6 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createEqual(DataType dt, EqualParamSpec p) override - { - OP_UNSUP(2, dt, p); - return std::shared_ptr(cep); - } - std::shared_ptr createInstanceNorm(DataType dt, InstanceNormParamSpec p) override { OP_UNSUP(2, dt, p); @@ -435,9 +428,9 @@ class FactoryOCL : public Factory { return std::shared_ptr(cep); } - std::shared_ptr createRoIAlign(RoIAlignParamSpec p) override + std::shared_ptr createRoIAlign(DataType dt, RoIAlignParamSpec p) override { - auto cep = (RoIAlign *)new RoIAlignOCL(p); + auto cep = (RoIAlign *)new RoIAlignOCL(dt, p); return std::shared_ptr(cep); } @@ -453,5 +446,53 @@ class FactoryOCL : public Factory { OP_UNSUP(2, dt, p); return std::shared_ptr(cep); } + std::shared_ptr createQuantizeLinear(DataType dt, QuantizeLinearParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createGridSample(DataType dt, GridSampleParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createOneHot(DataType dt, OneHotParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createCumSum(DataType dt, CumSumParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonMaxSuppression( + DataType dt, NonMaxSuppressionParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createConstantOfShape(DataType dt, ConstantOfShapeParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } + + std::shared_ptr createNonZero(DataType dt) override + { + OP_UNSUP(1, dt); + return std::shared_ptr(cep); + } + + std::shared_ptr createRange(DataType dt, RangeParamSpec p) override + { + OP_UNSUP(2, dt, p); + return std::shared_ptr(cep); + } }; #endif // 
_FACTORY_OCL_H diff --git a/inference/engine/include/ocl/fully_connected_ocl.hpp b/inference/engine/include/ocl/fully_connected_ocl.hpp index 1f832e60..b621ead0 100644 --- a/inference/engine/include/ocl/fully_connected_ocl.hpp +++ b/inference/engine/include/ocl/fully_connected_ocl.hpp @@ -131,9 +131,7 @@ class FullyConnectedOCL : public FullyConnected { auto biasMem = (OclMemory *)inTensors[1]->get_memory(); biasMem->padding(0, 8, 0, 0); } - if (this->p.num_slices > 1) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(this->p.num_slices == 1); return SUCCESS; } @@ -174,11 +172,8 @@ class FullyConnectedOCL : public FullyConnected { { Tensor inputTensor = this->inputTensors[0]; Tensor filterTensor = this->weightTensors[0]; - if (this->p.num_slices == 1) { - CHECK_STATUS(alloc_wtm_memory()); - } else { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(this->p.num_slices == 1); + CHECK_STATUS(alloc_wtm_memory()); CHECK_STATUS(fully_connected_transform_filter( inputTensor, filterTensor, this->wtm.get(), &this->archInfo)); this->weightTensors[0] = *this->get_wtm(); diff --git a/inference/engine/include/ocl/layer_norm_ocl.hpp b/inference/engine/include/ocl/layer_norm_ocl.hpp index 4b3e8cd8..aefb73ba 100644 --- a/inference/engine/include/ocl/layer_norm_ocl.hpp +++ b/inference/engine/include/ocl/layer_norm_ocl.hpp @@ -18,7 +18,7 @@ class LayerNormOCL : public LayerNorm { public: - LayerNormOCL(DataType dt, U32 weightNum) : LayerNorm(dt, weightNum) + LayerNormOCL(DataType dt, LayerNormParamSpec p, U32 weightNum) : LayerNorm(dt, p, weightNum) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class LayerNormOCL : public LayerNorm { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new LayerNormOCL(this->dt, this->weightNum)); + std::shared_ptr(new LayerNormOCL(this->dt, this->p, this->weightNum)); *mem = *this; return mem; } @@ -66,8 +66,8 @@ class LayerNormOCL : public LayerNorm { Tensor weightTensor = this->weightTensors[0]; Tensor biasTensor = this->biasTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(layer_normalization( - inputTensor, weightTensor, biasTensor, this->temp, outputTensor, &this->archInfo)); + CHECK_STATUS(layer_normalization(inputTensor, this->p, weightTensor, biasTensor, this->temp, + outputTensor, &this->archInfo)); } EE infer_output_tensors_size( diff --git a/inference/engine/include/ocl/matmul_ocl.hpp b/inference/engine/include/ocl/matmul_ocl.hpp index f1407b73..bd076d1b 100644 --- a/inference/engine/include/ocl/matmul_ocl.hpp +++ b/inference/engine/include/ocl/matmul_ocl.hpp @@ -84,9 +84,7 @@ class MatMulOCL : public MatMul { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (inTensors.size() > 2) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(inTensors.size() == 2); CHECK_STATUS(matmul_infer_output_size(inTensors[0], this->p.transpose_a, inTensors[1], this->p.transpose_b, outTensors[0], &this->archInfo)); if (check_tensors_image(inTensors)) { diff --git a/inference/engine/include/ocl/power_ocl.hpp b/inference/engine/include/ocl/power_ocl.hpp index 61851551..d643d17d 100644 --- a/inference/engine/include/ocl/power_ocl.hpp +++ b/inference/engine/include/ocl/power_ocl.hpp @@ -45,7 +45,7 @@ class PowerOCL : public Power { std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - CHECK_STATUS(power_infer_output_size(inTensors[0], outTensors[0], &this->archInfo)); + CHECK_STATUS(power_infer_output_size(inTensors[0], this->p, outTensors[0], 
&this->archInfo)); if (check_tensors_image(inTensors) && inTensors[0] != outTensors[0]) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); } diff --git a/inference/engine/include/ocl/preallocated_memory_ocl.hpp b/inference/engine/include/ocl/preallocated_memory_ocl.hpp index 56b28b78..e840d310 100644 --- a/inference/engine/include/ocl/preallocated_memory_ocl.hpp +++ b/inference/engine/include/ocl/preallocated_memory_ocl.hpp @@ -18,7 +18,7 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { public: - PreAllocatedMemoryOCL(DataType dt, TensorDesc desc) : PreAllocatedMemory(dt, desc) + PreAllocatedMemoryOCL(PreAllocatedMemoryParamSpec p) : PreAllocatedMemory(p) { INIT_GPU_INFO(nullptr) } @@ -28,7 +28,7 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new PreAllocatedMemoryOCL(this->dt, this->desc)); + std::shared_ptr(new PreAllocatedMemoryOCL(this->p)); *mem = *this; return mem; } @@ -36,19 +36,15 @@ class PreAllocatedMemoryOCL : public PreAllocatedMemory { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(preallocated_memory(this->outputTensors[0], &this->archInfo)); + CHECK_STATUS(preallocated_memory(this->p, this->outputTensors[0], &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; - if (inTensors.size() > 0) { - CHECK_STATUS(NOT_MATCH); - } - outTensors[0]->resize(this->desc); - CHECK_STATUS(preallocated_memory_infer_output_size(outTensors[0], &this->archInfo)); - return SUCCESS; + return preallocated_memory_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/prelu_ocl.hpp b/inference/engine/include/ocl/prelu_ocl.hpp index 5ddd26ff..f2708611 100644 --- a/inference/engine/include/ocl/prelu_ocl.hpp +++ b/inference/engine/include/ocl/prelu_ocl.hpp @@ -39,13 +39,11 @@ class PReLUOCL : public PReLU { if (curOpWs.weight != nullptr) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } - if (weightNum == 0) { - CHECK_STATUS(NOT_SUPPORTED); - } + CHECK_REQUIREMENT(weightNum != 0); if (weightNum == 1) { - this->preluDesc.propagate_down = true; + this->p.propagate_down = true; } else { - this->preluDesc.propagate_down = false; + this->p.propagate_down = false; } Tensor modelWeightTensor = Tensor(OCLMem); TensorDesc weightDesc = tensor1d(this->dt, weightNum); @@ -57,7 +55,7 @@ class PReLUOCL : public PReLU { inline void run_prepare() { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); - CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->preluDesc, + CHECK_STATUS(prelu(this->inputTensors[0], this->weightTensors[0], this->p, this->outputTensors[0], &this->archInfo)); } diff --git a/inference/engine/include/ocl/repeat_ocl.hpp b/inference/engine/include/ocl/repeat_ocl.hpp index 32a7efbc..e79baeaf 100644 --- a/inference/engine/include/ocl/repeat_ocl.hpp +++ b/inference/engine/include/ocl/repeat_ocl.hpp @@ -45,18 +45,10 @@ class RepeatOCL : public Repeat { if (this->inputTensors.size() > 1) { Tensor inputTensor = this->inputTensors[1]; TensorDesc inputDesc = inputTensor.get_desc(); - GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr()); U32 length = tensorNumElements(inputDesc); - DataFormat df = ptr->desc.memFormat; - if (df != DF_NCHW) { - CHECK_STATUS(NOT_SUPPORTED); 
- } - U32 w_off, h_off; - w_off = ptr->desc.offset[0]; - h_off = ptr->desc.offset[1]; - if (w_off != 0 || h_off != 0) { - CHECK_STATUS(NOT_SUPPORTED); - } + GCLMem_t ptr = (GCLMem_t)(((OclMemory *)(inputTensor.get_memory()))->get_ptr()); + CHECK_REQUIREMENT(ptr->desc.memFormat == DF_NCHW); + CHECK_REQUIREMENT(ptr->desc.offset[0] == 0 && ptr->desc.offset[1] == 0); I32 *val = hostVal.get(); CHECK_STATUS(gcl_trans_memory(OCLContext::getInstance().handle.get(), ptr, val, &length, DEVICE_BUF_TO_HOST, CL_TRUE)); diff --git a/inference/engine/include/ocl/resize_ocl.hpp b/inference/engine/include/ocl/resize_ocl.hpp index b55ed05d..021174bc 100644 --- a/inference/engine/include/ocl/resize_ocl.hpp +++ b/inference/engine/include/ocl/resize_ocl.hpp @@ -19,7 +19,7 @@ class ResizeOCL : public Resize { public: - ResizeOCL(DataType paramDT, ResizeParamSpec p) : Resize(paramDT, p) + ResizeOCL(DataType dt, ResizeParamSpec p) : Resize(dt, p) { INIT_GPU_INFO(nullptr) } @@ -29,7 +29,7 @@ class ResizeOCL : public Resize { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ResizeOCL(this->paramDT, this->p)); + std::shared_ptr(new ResizeOCL(this->dt, this->p)); *mem = *this; return mem; } @@ -39,31 +39,16 @@ class ResizeOCL : public Resize { OCLContext::getInstance().handle.get()->curOpName = this->get_name(); Tensor inputTensor = this->inputTensors[0]; Tensor outputTensor = this->outputTensors[0]; - CHECK_STATUS(resize(inputTensor, this->temp, outputTensor, this->p, &this->archInfo)); + CHECK_STATUS(resize(inputTensor, this->p, this->temp, outputTensor, &this->archInfo)); } EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { this->needSetKernelVec = true; + CHECK_STATUS( + resize_infer_output_size(inTensors[0], this->p, outTensors[0], &this->archInfo)); TensorDesc desc = inTensors[0]->get_desc(); - U32 bytes; - switch (paramDT) { - case DT_F32: { - CHECK_REQUIREMENT(1 == this->p.scales[0] && 1 == this->p.scales[1]); - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, - this->p.scales + 2, outTensors[0], &bytes, &this->archInfo)); - break; - } - case DT_U32: { - CHECK_STATUS(resize_infer_output_size(inTensors[0], this->paramDT, this->p.sizes, - outTensors[0], &bytes, &this->archInfo)); - break; - } - default: { - CHECK_STATUS(NOT_SUPPORTED); - } - } if (desc.df == DF_NCHWC4 && check_tensors_image(inTensors)) { CHECK_STATUS(set_tensors_image(outTensors, inTensors.size())); } @@ -72,12 +57,10 @@ class ResizeOCL : public Resize { U32 infer_tmp_memory_size() override { - U32 size = 0; - TensorDesc inputDesc = inputTensors[0].get_desc(); - if (inputDesc.df == DF_NCHW && inputTensors[0].get_mem_type() != OCLMem) { - size = tensorNumBytes(inputDesc); - } - return size; + U32 bytes = 0; + CHECK_STATUS(resize_infer_forward_tmp_bytes( + this->inputTensors[0], this->p, this->outputTensors[0], &bytes, &this->archInfo)); + return bytes; } REGISTER_OCL_OPERATOR_RUN diff --git a/inference/engine/include/ocl/rnn_ocl.hpp b/inference/engine/include/ocl/rnn_ocl.hpp index deb78b70..0c919761 100644 --- a/inference/engine/include/ocl/rnn_ocl.hpp +++ b/inference/engine/include/ocl/rnn_ocl.hpp @@ -47,8 +47,8 @@ class RNNOCL : public RNNCellOCL { ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; I32 algo[10]; - U32 algoNum = (this->p.numProjection > 0) ? 10 : 7; - std::string name = this->name + std::to_string(get_type()); + U32 algoNum = (this->p.num_projection > 0) ? 
10 : 7; + std::string name = this->name + std::to_string(get_type()); if (algorithmMap->getAlgorithmInfoFromMap(name, algo, algoNum)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -119,13 +119,13 @@ class RNNOCL : public RNNCellOCL { this->wtm_gemv = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_gemv->resize(ftmDesc[1]); this->wtm_gemv->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro->resize(ftmDesc[2]); this->wtm_pro->alloc(); } - if (this->p.biDirection) { + if (this->p.bi_direction) { this->wtm_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_bi->resize(ftmDesc[0]); CHECK_STATUS(set_wtm_image(ftmDesc[0], &wtm_bi)); @@ -133,7 +133,7 @@ class RNNOCL : public RNNCellOCL { this->wtm_gemv_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_gemv_bi->resize(ftmDesc[1]); this->wtm_gemv_bi->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro_bi = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro_bi->resize(ftmDesc[2]); this->wtm_pro_bi->alloc(); @@ -147,8 +147,8 @@ class RNNOCL : public RNNCellOCL { CHECK_STATUS(alloc_wtm_memory()); std::vector filterTensors; std::vector ftmTensors; - U32 weightNum = (this->p.numProjection > 0) ? 2 : 1; - U32 directions = (this->p.biDirection) ? 2 : 1; + U32 weightNum = (this->p.num_projection > 0) ? 2 : 1; + U32 directions = (this->p.bi_direction) ? 2 : 1; for (U32 i = 0; i < directions; i++) { for (U32 j = 0; j < weightNum; j++) { filterTensors.push_back(this->weightTensors[i * weightNum + j]); @@ -157,13 +157,13 @@ class RNNOCL : public RNNCellOCL { ftmTensors.push_back(this->wtm.get()); ftmTensors.push_back(this->wtm_gemv.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { ftmTensors.push_back(this->wtm_pro.get()); } - if (this->p.biDirection) { + if (this->p.bi_direction) { ftmTensors.push_back(this->wtm_bi.get()); ftmTensors.push_back(this->wtm_gemv_bi.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { ftmTensors.push_back(this->wtm_pro_bi.get()); } } @@ -178,16 +178,16 @@ class RNNOCL : public RNNCellOCL { weightNumCount++; this->weightTensors[weightNumCount] = *this->wtm_gemv.get(); weightNumCount++; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[weightNumCount] = (*this->wtm_pro.get()); weightNumCount++; } - if (this->p.biDirection) { + if (this->p.bi_direction) { this->weightTensors[weightNumCount] = *this->wtm_bi.get(); weightNumCount++; this->weightTensors[weightNumCount] = *this->wtm_gemv_bi.get(); weightNumCount++; - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[weightNumCount] = (*this->wtm_pro_bi.get()); weightNumCount++; } diff --git a/inference/engine/include/ocl/rnncell_ocl.hpp b/inference/engine/include/ocl/rnncell_ocl.hpp index b79b0eef..6a1170e9 100644 --- a/inference/engine/include/ocl/rnncell_ocl.hpp +++ b/inference/engine/include/ocl/rnncell_ocl.hpp @@ -41,13 +41,13 @@ class RNNCellOCL : public RNNCell { Tensor hTensor = this->outputTensors[0]; CHECK_STATUS(rnncell(xTensor, this->weightTensors, this->biasTensors, stateTensor, this->p, - this->xDim, this->p.numOutput, 0, this->temp, hTensor, &this->archInfo)); + this->xDim, this->p.num_outputs, 0, this->temp, hTensor, &this->archInfo)); } EE infer_forward_algorithm(std::shared_ptr algorithmMap) override { - if 
(this->p.biDirection) { - CHECK_STATUS(NOT_SUPPORTED); + if (this->p.bi_direction) { + UNI_ERROR_LOG("gpu not support bi-direction rnn.\n"); } OCLContext::getInstance().handle.get()->kernelVec = &this->opKernelVec; Tensor xTensor = this->inputTensors[0]; @@ -58,8 +58,8 @@ class RNNCellOCL : public RNNCell { ((MaliPara_t)(this->archInfo.archPara))->forwardRunInfo->algorithm = CONVOLUTION_ALGORITHM_NULL; I32 algo[7]; - U32 algoNum = (this->p.numProjection > 0) ? 7 : 4; - std::string name = this->name + std::to_string(get_type()); + U32 algoNum = (this->p.num_projection > 0) ? 7 : 4; + std::string name = this->name + std::to_string(get_type()); if (algorithmMap->getAlgorithmInfoFromMap(name, algo, algoNum)) { this->runInfo.algorithm = (ConvolutionForwardAlgorithm)algo[0]; this->runInfo.best_h[0] = algo[1]; @@ -72,7 +72,7 @@ class RNNCellOCL : public RNNCell { } } else { CHECK_STATUS(rnncell_infer_forward_algorithm(xTensor, filterTensor, biasTensor, - stateTensor, this->p, this->xDim, this->p.numOutput, hTensor, &this->archInfo)); + stateTensor, this->p, this->xDim, this->p.num_outputs, hTensor, &this->archInfo)); algo[0] = this->runInfo.algorithm; algo[1] = this->runInfo.best_h[0]; algo[2] = this->runInfo.best_c[0]; @@ -118,7 +118,7 @@ class RNNCellOCL : public RNNCell { this->wtm = std::shared_ptr(new Tensor(this->wtmType)); this->wtm->resize(ftmDesc[0]); this->wtm->alloc(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->wtm_pro = std::shared_ptr(new Tensor(this->wtmType)); this->wtm_pro->resize(ftmDesc[1]); this->wtm_pro->alloc(); @@ -133,13 +133,13 @@ class RNNCellOCL : public RNNCell { std::vector ftmTensors; filterTensors.push_back(this->weightTensors[0]); ftmTensors.push_back(this->wtm.get()); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { filterTensors.push_back(this->weightTensors[1]); ftmTensors.push_back(this->wtm_pro.get()); } CHECK_STATUS(rnncell_transform_filter(filterTensors, this->p, ftmTensors, &this->archInfo)); this->weightTensors[0] = *this->get_wtm(); - if (this->p.numProjection > 0) { + if (this->p.num_projection > 0) { this->weightTensors[1] = *wtm_pro.get(); } return SUCCESS; @@ -147,20 +147,20 @@ class RNNCellOCL : public RNNCell { EE infer_weight_desc() override { - U32 column = (this->p.numProjection > 0) ? this->p.numProjection : this->p.numOutput; + U32 column = (this->p.num_projection > 0) ? this->p.num_projection : this->p.num_outputs; U32 filterRow = 4 * column; - U32 filterCol = this->p.numOutput + this->xDim; + U32 filterCol = this->p.num_outputs + this->xDim; TensorDesc weightDesc[2]; TensorDesc biasDesc[2]; weightDesc[0] = tensor2df(this->dt, DF_NK, filterRow, filterCol); - weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.numOutput, this->p.numProjection); + weightDesc[1] = tensor2df(this->dt, DF_NK, this->p.num_outputs, this->p.num_projection); biasDesc[0] = tensor1d(this->dt, filterRow); - biasDesc[1] = tensor1d(this->dt, this->p.numOutput); - U32 weightNum = (this->p.numProjection > 0) ? 2 : 1; + biasDesc[1] = tensor1d(this->dt, this->p.num_outputs); + U32 weightNum = (this->p.num_projection > 0) ? 2 : 1; U32 biasNum = weightNum; - U32 diretions = (this->p.biDirection) ? 2 : 1; + U32 diretions = (this->p.bi_direction) ? 
2 : 1; if (this->p.mode != RNN_LSTM) { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("gpu rnn only support lstm.\n"); } for (U32 d = 0; d < diretions; d++) { diff --git a/inference/engine/include/ocl/roialign_ocl.hpp b/inference/engine/include/ocl/roialign_ocl.hpp index 744fb615..a0bfd60e 100644 --- a/inference/engine/include/ocl/roialign_ocl.hpp +++ b/inference/engine/include/ocl/roialign_ocl.hpp @@ -18,7 +18,7 @@ class RoIAlignOCL : public RoIAlign { public: - RoIAlignOCL(RoIAlignParamSpec p) : RoIAlign(p) + RoIAlignOCL(DataType dt, RoIAlignParamSpec p) : RoIAlign(dt, p) { INIT_GPU_INFO(nullptr) } @@ -27,7 +27,8 @@ class RoIAlignOCL : public RoIAlign { std::shared_ptr clone() override { - std::shared_ptr mem = std::shared_ptr(new RoIAlignOCL(this->p)); + std::shared_ptr mem = + std::shared_ptr(new RoIAlignOCL(this->dt, this->p)); *mem = *this; return mem; } diff --git a/inference/engine/include/ocl/scale_ocl.hpp b/inference/engine/include/ocl/scale_ocl.hpp index 1b86a991..f6df3a22 100644 --- a/inference/engine/include/ocl/scale_ocl.hpp +++ b/inference/engine/include/ocl/scale_ocl.hpp @@ -28,7 +28,7 @@ class ScaleOCL : public Scale { std::shared_ptr clone() override { std::shared_ptr mem = - std::shared_ptr(new ScaleOCL(this->dt, this->p, this->numChannels)); + std::shared_ptr(new ScaleOCL(this->dt, this->p, 0)); *mem = *this; return mem; } @@ -38,7 +38,6 @@ class ScaleOCL : public Scale { auto curOpWs = this->get_weightspec(); U32 weightNum = 0; U32 vecNum = 0; - this->numChannels = 0; if (0 != curOpWs.bytes_of_weight) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } @@ -72,10 +71,7 @@ class ScaleOCL : public Scale { int inputNum = this->inputTensors.size(); Tensor inputTensor = this->inputTensors[this->dataID]; Tensor outputTensor = this->outputTensors[0]; - if (inputNum == 1 && weightTensors.size() == 0 && biasTensors.size() == 0) { - CHECK_STATUS(NOT_MATCH); - } - + CHECK_REQUIREMENT(inputNum != 1 || weightTensors.size() != 0 || biasTensors.size() != 0); if (inputNum > 1) { U32 cNum = this->inputTensors[this->dataID].get_desc().dims[2]; for (int i = 0; i < inputNum; i++) { @@ -92,8 +88,10 @@ class ScaleOCL : public Scale { desc.offset[1] == 0) { continue; } + } else { + UNI_ERROR_LOG("gpu scale not support %s format input.\n", + DataFormatName()[desc.memFormat]); } - CHECK_STATUS(NOT_MATCH); } } } diff --git a/inference/engine/include/onehot.hpp b/inference/engine/include/onehot.hpp new file mode 100644 index 00000000..2e0876a9 --- /dev/null +++ b/inference/engine/include/onehot.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _ONEHOT_H +#define _ONEHOT_H + +#include "operator.hpp" + +class OneHot : public Operator { +public: + explicit OneHot(DataType dt, OneHotParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_OneHot; + } + +protected: + OneHotParamSpec p; +}; +#endif // _ONEHOT_H diff --git a/inference/engine/include/operator.hpp b/inference/engine/include/operator.hpp index 5426ac73..810dac88 100644 --- a/inference/engine/include/operator.hpp +++ b/inference/engine/include/operator.hpp @@ -23,7 +23,7 @@ #include "gcl_engine.h" #include "image_container.hpp" #endif -#include "parameter_spec.h" +#include "tensor_computing.h" class Operator { public: @@ -155,7 +155,7 @@ class Operator { featureScale.resize(num); for (U32 i = 0; i < num; i++) { featureScale[i].resize(qs[i].num_scale); - memcpy(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); + UNI_MEMCPY(featureScale[i].data(), qs[i].scale, qs[i].num_scale * bytesOf(DT_F32)); } #endif } @@ -251,7 +251,7 @@ class Operator { if (size[0] == 0 && size[1] == 0 && size[2] == 0) { return false; } else if (size[0] == 0 || size[1] == 0 || size[2] == 0) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("gpu tmp buffer(on image buffer) parameter is wrong.\n"); } *tensor = this->tempImages->get(slot, size[0], size[1], size[2]); findMatchImage = true; @@ -302,6 +302,25 @@ class Operator { } #endif + int is_shape(std::vector tensors) + { + int count = 0; + for (U32 i = 0; i < tensors.size(); i++) { + count += tensorIsShape(tensors[i]->get_desc()); + } + return count; + } + + TensorDesc tensor_shape(Tensor tensor) + { + TensorDesc desc = tensor.get_desc(); + U32 *ptr = (U32 *)((CpuMemory *)(tensor.get_memory()))->get_ptr(); + for (U32 i = 0; i < tensor.length() && desc.nDims + i < DIM_LEN; i++) { + desc.dims[desc.nDims + i] = ptr[i]; + } + return desc; + } + protected: ArchInfo archInfo; DataType dt; diff --git a/inference/engine/include/parse_command.h b/inference/engine/include/parse_command.h index 81db5c4d..fb3ef113 100644 --- a/inference/engine/include/parse_command.h +++ b/inference/engine/include/parse_command.h @@ -20,19 +20,12 @@ #include "error.h" #ifdef _USE_FP16 - -inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName) +inline U32 getBinFileSize(CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += "/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *file = fopen(fileName.c_str(), "rb"); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "rb"); if (file == NULL) { - UNI_WARNING_LOG("can not get %s file size.\n", fileName.c_str()); + UNI_ERROR_LOG("can not get %s file size.\n", path.c_str()); return 0; } fseek(file, 0, SEEK_END); @@ -42,50 +35,34 @@ inline U32 getBinFileSize(CI8 *dataPath, CI8 *dataName) return size; } -inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +inline void writeF16ToF32Bin(F16 *data, U32 num, CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += 
"/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *outfile = fopen(fileName.c_str(), "wb"); - if (outfile == NULL) { - UNI_WARNING_LOG("can not write %s.\n", fileName.c_str()); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "wb"); + if (file == NULL) { + UNI_ERROR_LOG("can not write %s.\n", path.c_str()); return; } - F32 *dataTran = new F32[num]; - for (U32 i = 0; i < num; i++) { - dataTran[i] = (F32)data[i]; - } - fwrite(dataTran, sizeof(float), num, outfile); - fclose(outfile); - delete[] dataTran; + float *buffer = (float *)UNI_MALLOC(sizeof(float) * num); + transformToFloat(DT_F16, data, buffer, num); + fwrite(buffer, sizeof(float), num, file); + fclose(file); + UNI_FREE(buffer); } -inline void readF32BinToF16(F16 *data, U32 num, CI8 *dataPath, CI8 *dataName) +inline void readF32BinToF16(F16 *data, U32 num, CI8 *directory, CI8 *name) { - std::string filePath = dataPath; - CI8 lastFlag = filePath[filePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - filePath += "/"; - } - std::string fileName = dataName; - fileName = filePath + fileName; - FILE *infile = fopen(fileName.c_str(), "rb"); - if (infile == NULL) { - UNI_WARNING_LOG("can not read %s.\n", fileName.c_str()); + std::string path = std::string(directory) + std::string("/") + std::string(name); + FILE *file = fopen(path.c_str(), "rb"); + if (file == NULL) { + UNI_ERROR_LOG("can not read %s.\n", path.c_str()); return; } - F32 *dataTran = new F32[num]; - fread(dataTran, sizeof(float), num, infile); - for (U32 i = 0; i < num; i++) { - data[i] = (F16)dataTran[i]; - } - fclose(infile); - delete[] dataTran; + float *buffer = (float *)UNI_MALLOC(sizeof(float) * num); + fread(buffer, sizeof(float), num, file); + transformFromFloat(DT_F16, buffer, data, num); + fclose(file); + UNI_FREE(buffer); } #endif diff --git a/inference/engine/include/pooling.hpp b/inference/engine/include/pooling.hpp index 677a303f..1c0f8f7c 100644 --- a/inference/engine/include/pooling.hpp +++ b/inference/engine/include/pooling.hpp @@ -15,7 +15,6 @@ #define _POOLING_H #include "operator.hpp" -#include "tensor_computing.h" class Pooling : public Operator { public: diff --git a/inference/engine/include/preallocated_memory.hpp b/inference/engine/include/preallocated_memory.hpp index 6a909c54..d83befd3 100644 --- a/inference/engine/include/preallocated_memory.hpp +++ b/inference/engine/include/preallocated_memory.hpp @@ -18,10 +18,9 @@ class PreAllocatedMemory : public Operator { public: - PreAllocatedMemory(DataType dt, TensorDesc desc) + PreAllocatedMemory(PreAllocatedMemoryParamSpec p) { - this->dt = dt; - this->desc = desc; + this->p = p; } OperatorType get_type() override @@ -30,7 +29,7 @@ class PreAllocatedMemory : public Operator { } protected: - TensorDesc desc; + PreAllocatedMemoryParamSpec p; }; #endif // _PREALLOCATED_MEMORY_H diff --git a/inference/engine/include/prelu.hpp b/inference/engine/include/prelu.hpp index 0a0e504c..5e020f84 100644 --- a/inference/engine/include/prelu.hpp +++ b/inference/engine/include/prelu.hpp @@ -29,7 +29,7 @@ class PReLU : public WeightOperator { } protected: - PReLUParamSpec preluDesc; + PReLUParamSpec p; }; #endif // _PADDING_H diff --git a/inference/engine/include/prior_box.hpp b/inference/engine/include/prior_box.hpp index 4ee39b74..b93a00f5 100644 --- a/inference/engine/include/prior_box.hpp +++ b/inference/engine/include/prior_box.hpp @@ -44,8 +44,7 @@ class PriorBox : public Operator { EE 
infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return priorbox_infer_output_size(inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/include/quantizelinear.hpp b/inference/engine/include/quantizelinear.hpp new file mode 100644 index 00000000..2cf3246b --- /dev/null +++ b/inference/engine/include/quantizelinear.hpp @@ -0,0 +1,36 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#ifndef _QUANTIZELINEAR_H +#define _QUANTIZELINEAR_H + +#include "operator.hpp" + +class QuantizeLinear : public Operator { +public: + QuantizeLinear(DataType dt, QuantizeLinearParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_QuantizeLinear; + } + +protected: + QuantizeLinearParamSpec p; +}; + +#endif // _QUANTIZELINEAR_H diff --git a/inference/engine/include/range.hpp b/inference/engine/include/range.hpp new file mode 100644 index 00000000..7d54ae90 --- /dev/null +++ b/inference/engine/include/range.hpp @@ -0,0 +1,35 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _RANGE_H +#define _RANGE_H + +#include "operator.hpp" + +class Range : public Operator { +public: + explicit Range(DataType dt, RangeParamSpec p) + { + this->dt = dt; + this->p = p; + } + + OperatorType get_type() override + { + return OT_Range; + } + +protected: + RangeParamSpec p; +}; +#endif // _RANGE_H diff --git a/inference/engine/include/reduction.hpp b/inference/engine/include/reduction.hpp index bca7f650..fa4f08e0 100644 --- a/inference/engine/include/reduction.hpp +++ b/inference/engine/include/reduction.hpp @@ -15,7 +15,6 @@ #define _REDUCTION_H #include "operator.hpp" -#include "tensor_computing.h" class Reduction : public Operator { public: @@ -41,10 +40,10 @@ class Reduction : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - if (this->p.axes_num == 0) { + if (this->p.num_axes == 0) { TensorDesc desc = inTensors[0]->get_desc(); - this->p.axes_num = desc.nDims; - for (int i = 0; i < this->p.axes_num; i++) { + this->p.num_axes = desc.nDims; + for (int i = 0; i < this->p.num_axes; i++) { this->p.axes[i] = i; } } diff --git a/inference/engine/include/relative_position_embedding.hpp b/inference/engine/include/relative_position_embedding.hpp index 3af4d378..f660533f 100644 --- a/inference/engine/include/relative_position_embedding.hpp +++ b/inference/engine/include/relative_position_embedding.hpp @@ -53,29 +53,29 @@ class RelativePositionEmbedding : public EmbeddingCPU { U32 batch = inputDesc.dims[inputDesc.nDims - 1]; U32 length = inputDesc.dims[inputDesc.nDims - 1 - tmpAxis]; for (U32 in = 0; in < batch; in++) { - U8 *ptr = outputPtr + in * length * this->p.num_output * bytesOf(this->dt); - if (length > this->p.input_dim) { - U32 size = (length - this->p.input_dim) * this->p.num_output * bytesOf(this->dt); - memset(ptr, 0, size); + U8 *ptr = outputPtr + in * length * this->p.num_outputs * bytesOf(this->dt); + if (length > this->p.num_inputs) { + U32 size = (length - this->p.num_inputs) * this->p.num_outputs * bytesOf(this->dt); + UNI_MEMSET(ptr, 0, size); ptr += size; } U32 start = 0; - U32 copyLength = this->p.input_dim; - if (length < this->p.input_dim) { - start = this->p.input_dim - length; + U32 copyLength = this->p.num_inputs; + if (length < this->p.num_inputs) { + start = this->p.num_inputs - length; copyLength = length; } if (this->p.transpose) { for (U32 i = 0; i < copyLength; i++) { - for (U32 j = 0; j < this->p.num_output; j++) { - memcpy(ptr, - weightPtr + (j * this->p.input_dim + start + i) * bytesOf(this->dt), + for (U32 j = 0; j < this->p.num_outputs; j++) { + UNI_MEMCPY(ptr, + weightPtr + (j * this->p.num_inputs + start + i) * bytesOf(this->dt), bytesOf(this->dt)); } } } else { - memcpy(ptr, weightPtr + start * this->p.num_output * bytesOf(this->dt), - copyLength * this->p.num_output * bytesOf(this->dt)); + UNI_MEMCPY(ptr, weightPtr + start * this->p.num_outputs * bytesOf(this->dt), + copyLength * this->p.num_outputs * bytesOf(this->dt)); } } } @@ -87,7 +87,7 @@ class RelativePositionEmbedding : public EmbeddingCPU { I32 tmpAxis = (this->p.axis + inDim.nDims) % inDim.nDims; U32 batch = inDim.dims[inDim.nDims - 1]; U32 length = inDim.dims[inDim.nDims - 1 - tmpAxis]; - TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_output); + TensorDesc outDim = tensor3df(this->dt, DF_MTK, batch, length, this->p.num_outputs); outTensors[0]->resize(outDim); return SUCCESS; } diff --git a/inference/engine/include/relative_shift.hpp b/inference/engine/include/relative_shift.hpp index 
74248d25..759d856a 100644 --- a/inference/engine/include/relative_shift.hpp +++ b/inference/engine/include/relative_shift.hpp @@ -50,7 +50,7 @@ class RelativeShift : public Operator { U32 length = inputDesc.dims[tmpAxis]; if (tmpAxis + 1 >= (I32)inputDesc.nDims) { U32 bytes = inputTensor.bytes(); - memcpy(outputPtr, inputPtr, bytes); + UNI_MEMCPY(outputPtr, inputPtr, bytes); return; } U32 loops = inputDesc.dims[tmpAxis + 1]; @@ -72,13 +72,13 @@ class RelativeShift : public Operator { (loops - this->p.shift_length) * (this->p.shift_length + length); U32 start = this->p.shift_length * length - num; U32 srcIndex = start * tileSize; - memcpy(dstPtr, srcPtr + srcIndex, num * tileSize); + UNI_MEMCPY(dstPtr, srcPtr + srcIndex, num * tileSize); dstPtr += num * tileSize; srcIndex += num * tileSize; for (U32 j = this->p.shift_length; j < loops; j++) { - memset(dstPtr, 0, this->p.shift_length * tileSize); + UNI_MEMSET(dstPtr, 0, this->p.shift_length * tileSize); dstPtr += this->p.shift_length * tileSize; - memcpy(dstPtr, srcPtr + srcIndex, chunkSize); + UNI_MEMCPY(dstPtr, srcPtr + srcIndex, chunkSize); dstPtr += chunkSize; srcIndex += chunkSize; } @@ -87,7 +87,7 @@ class RelativeShift : public Operator { srcPtr += this->p.shift_length * loops * tileSize; for (U32 j = 0; j < loops; j++) { for (U32 k = 0; k < klen; k++) { - memcpy(dstPtr, srcPtr, tileSize); + UNI_MEMCPY(dstPtr, srcPtr, tileSize); srcPtr += tileSize; dstPtr += tileSize; } diff --git a/inference/engine/include/resize.hpp b/inference/engine/include/resize.hpp index 981855b0..88da402f 100644 --- a/inference/engine/include/resize.hpp +++ b/inference/engine/include/resize.hpp @@ -18,14 +18,10 @@ class Resize : public Operator { public: - Resize(DataType paramDT, ResizeParamSpec p) + Resize(DataType dt, ResizeParamSpec p) { - if (paramDT == DT_F32 || paramDT == DT_U32) { - this->paramDT = paramDT; - this->p = p; - } else { - CHECK_STATUS(NOT_SUPPORTED); - } + this->dt = dt; + this->p = p; } OperatorType get_type() override @@ -34,7 +30,6 @@ class Resize : public Operator { } protected: - DataType paramDT; ResizeParamSpec p; }; diff --git a/inference/engine/include/roialign.hpp b/inference/engine/include/roialign.hpp index b171636d..d01f60be 100644 --- a/inference/engine/include/roialign.hpp +++ b/inference/engine/include/roialign.hpp @@ -15,12 +15,12 @@ #define _ROIALIGN_H #include "operator.hpp" -#include "tensor_computing.h" class RoIAlign : public Operator { public: - RoIAlign(RoIAlignParamSpec p) + RoIAlign(DataType dt, RoIAlignParamSpec p) { + this->dt = dt; this->p = p; } diff --git a/inference/engine/include/scale.hpp b/inference/engine/include/scale.hpp index 9ee5490a..d8f006b6 100644 --- a/inference/engine/include/scale.hpp +++ b/inference/engine/include/scale.hpp @@ -22,7 +22,6 @@ class Scale : public WeightOperator { { this->dt = dt; this->p = p; - this->numChannels = numChannels; this->dataID = 0; } @@ -39,44 +38,30 @@ class Scale : public WeightOperator { U32 find_target_axis_len(std::vector inTensors) { auto curOpWs = this->get_weightspec(); - U32 weightNum = 0; - U32 vecNum = 0; + int weightNum = 0; + int vecNum = 0; if (0 != curOpWs.bytes_of_weight) { weightNum = curOpWs.bytes_of_weight / UNI_MAX(1, bytesOf(curOpWs.mdt)); } else if (0 != curOpWs.bytes_of_vec) { vecNum = curOpWs.bytes_of_vec / UNI_MAX(1, bytesOf(curOpWs.mdt)); } if (weightNum > 0 && vecNum > 0 && weightNum != vecNum) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG( + "scale alpha length(%d) is not equal to beta length(%d).\n", weightNum, vecNum); } - 
this->numChannels = (weightNum) ? weightNum : vecNum; + int numChannels = (weightNum) ? weightNum : vecNum; if (weightNum == 0 && vecNum == 0) { if (inTensors.size() == 1) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("scale doesn't have alpha or beta.\n"); } TensorDesc desc = inTensors[1 - dataID]->get_desc(); - this->numChannels = tensorNumElements(desc); + numChannels = tensorNumElements(desc); } - - TensorDesc inputDesc = inTensors[dataID]->get_desc(); - U32 axisLen = this->numChannels; - I32 axis = p.axis; - U32 nDims = inputDesc.nDims; - axis = (nDims + axis) % nDims; - axis = nDims - 1 - axis; - if (axisLen != inputDesc.dims[axis]) { - for (U32 i = 0; i < nDims; i++) { - if (inputDesc.dims[nDims - 1 - i] == axisLen) { - p.axis = i; - } - } - } - return axisLen; + return numChannels; } protected: ScaleParamSpec p; - U32 numChannels; int dataID; }; diff --git a/inference/engine/include/select.hpp b/inference/engine/include/select.hpp index 06d0da04..c552c11c 100644 --- a/inference/engine/include/select.hpp +++ b/inference/engine/include/select.hpp @@ -27,10 +27,5 @@ class Select : public Operator { { return OT_Select; } - - bool can_input_output_the_same() override - { - return false; - } }; #endif // _SELECT_H diff --git a/inference/engine/include/weight_operator.hpp b/inference/engine/include/weight_operator.hpp index a5770a36..7d4d1488 100644 --- a/inference/engine/include/weight_operator.hpp +++ b/inference/engine/include/weight_operator.hpp @@ -15,7 +15,6 @@ #define _WEIGHTOPERATOR_H #include "operator.hpp" -#include "tensor_computing.h" #include "model_spec.h" class WeightOperator : public Operator { @@ -118,7 +117,7 @@ class WeightOperator : public Operator { return SUCCESS; } - virtual EE init_weight_bias_from_model(std::shared_ptr *modelPtr) + virtual EE init_weight_bias_from_model(std::shared_ptr *modelPtr = nullptr) { EE ret = this->infer_weight_desc(); if (ret != SUCCESS) { @@ -151,6 +150,15 @@ class WeightOperator : public Operator { weight_offset += tensorNumBytes(desc); } + if (curOpWs.num_quant_scale == this->weightTensors.size()) { + for (U32 i = 0; i < this->weightTensors.size(); ++i) { + if (curOpWs.weight_scale[i].num_scale > 0) { + this->weightTensors[i].set_scale_ptr( + std::shared_ptr(curOpWs.weight_scale[i].scale, [](F32 *) {})); + } + } + } + U32 bias_offset = (modelPtr != nullptr) ? 
weight_offset : 0; if (this->hasBias) { for (auto bias_tensor : this->biasTensors) { @@ -169,7 +177,7 @@ class WeightOperator : public Operator { bias_mem_src.resize(desc); bias_mem_src.alloc(); U8 *tmp = (U8 *)bias_mem_src.get_ptr(); - memset(tmp, 0, bias_mem_src.bytes()); + UNI_MEMSET(tmp, 0, bias_mem_src.bytes()); bias_mem_dst->reuse(&bias_mem_src); } } diff --git a/inference/engine/include/where.hpp b/inference/engine/include/where.hpp index a22f1221..5ec80270 100644 --- a/inference/engine/include/where.hpp +++ b/inference/engine/include/where.hpp @@ -14,9 +14,9 @@ #ifndef _WHERE_H #define _WHERE_H -#include "weight_operator.hpp" +#include "operator.hpp" -class Where : public WeightOperator { +class Where : public Operator { public: Where(DataType dt) { @@ -27,11 +27,6 @@ class Where : public WeightOperator { { return OT_Where; } - - bool can_input_output_the_same() override - { - return false; - } }; #endif // _WHERE_H diff --git a/inference/engine/include/yolov3_detection_output.hpp b/inference/engine/include/yolov3_detection_output.hpp index 1c4f6188..aa3cb678 100644 --- a/inference/engine/include/yolov3_detection_output.hpp +++ b/inference/engine/include/yolov3_detection_output.hpp @@ -46,9 +46,8 @@ class Yolov3DetectionOutput : public Operator { EE infer_output_tensors_size( std::vector inTensors, std::vector outTensors) override { - CHECK_STATUS(yolov3detectionoutput_infer_output_size( - inTensors, this->p, outTensors[0], &this->archInfo)); - return SUCCESS; + return yolov3detectionoutput_infer_output_size( + inTensors, this->p, outTensors[0], &this->archInfo); } protected: diff --git a/inference/engine/src/BoltModel_Jni.cpp b/inference/engine/src/BoltModel_Jni.cpp index 5aa8f14a..32957760 100644 --- a/inference/engine/src/BoltModel_Jni.cpp +++ b/inference/engine/src/BoltModel_Jni.cpp @@ -48,6 +48,8 @@ AFFINITY_TYPE str2AFFINITY_TYPE(std::string affinity_str) ret = CPU_LOW_POWER; } else if (affinity_str == "GPU") { ret = GPU; + } else if (affinity_str == "CPU") { + ret = CPU; } else { UNI_ERROR_LOG("unsupported JNI CPU affinity setting %s.\n", affinity_str.c_str()); } @@ -84,7 +86,7 @@ DATA_TYPE str2DATA_TYPE(std::string data_type) DATA_TYPE ret = FP_32; if (data_type == "FP32") { ret = FP_32; -#ifdef __aarch64__ +#ifdef _USE_FP16 } else if (data_type == "FP16") { ret = FP_16; #endif @@ -186,13 +188,13 @@ void getInputParameters(JNIEnv *env, UNI_ERROR_LOG("input DataFormat array length %d is not equal to input num %d\n", env->GetArrayLength(df_input), num); } - int *data_n = (int *)malloc(num * sizeof(int)); - int *data_c = (int *)malloc(num * sizeof(int)); - int *data_h = (int *)malloc(num * sizeof(int)); - int *data_w = (int *)malloc(num * sizeof(int)); - char **data_name = (char **)malloc(num * sizeof(char *)); - DATA_TYPE *data_dt = (DATA_TYPE *)malloc(num * sizeof(DATA_TYPE)); - DATA_FORMAT *data_df = (DATA_FORMAT *)malloc(num * sizeof(DATA_FORMAT)); + int *data_n = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_c = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_h = (int *)UNI_MALLOC(num * sizeof(int)); + int *data_w = (int *)UNI_MALLOC(num * sizeof(int)); + char **data_name = (char **)UNI_MALLOC(num * sizeof(char *)); + DATA_TYPE *data_dt = (DATA_TYPE *)UNI_MALLOC(num * sizeof(DATA_TYPE)); + DATA_FORMAT *data_df = (DATA_FORMAT *)UNI_MALLOC(num * sizeof(DATA_FORMAT)); jint *curArray_n = env->GetIntArrayElements(n, 0); jint *curArray_c = env->GetIntArrayElements(c, 0); jint *curArray_h = env->GetIntArrayElements(h, 0); @@ -206,7 +208,7 @@ void getInputParameters(JNIEnv 
*env, jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - data_name[i] = (char *)malloc(sizeof(char) * (length + 1)); + data_name[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(data_name[i], cur_str_ptr, length); data_name[i][length] = '\0'; @@ -265,9 +267,9 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_createModel)( const char *affinityPtr = env->GetStringUTFChars(affinity, JNI_FALSE); std::string affinity_str = (std::string)affinityPtr; AFFINITY_TYPE affinity_cur = str2AFFINITY_TYPE(affinity_str); - long modelAddr = (long)CreateModel(modelPathPtr, affinity_cur, NULL); + long modelAddr = (long long)CreateModel(modelPathPtr, affinity_cur, NULL); ModelHandleInfo *ihInfo = (ModelHandleInfo *)modelAddr; - if (nullptr == ihInfo->cnn) { + if (nullptr == ihInfo) { UNI_ERROR_LOG("Bolt instance not created\n"); modelAddr = 0; } @@ -283,7 +285,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneModel)( UNI_DEBUG_LOG("JNI %s...\n", __FUNCTION__); ModelHandle handle = (ModelHandle)modelAddr; ModelHandle cloneHandle = CloneModel(handle); - long ret = (long)cloneHandle; + long ret = (long long)cloneHandle; UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); return ret; } @@ -315,16 +317,16 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_prepareModel)(JNIEnv PrepareModel( ih, num_input, (const char **)data_name, data_n, data_c, data_h, data_w, data_dt, data_df); - free(data_n); - free(data_c); - free(data_h); - free(data_w); + UNI_FREE(data_n); + UNI_FREE(data_c); + UNI_FREE(data_h); + UNI_FREE(data_w); for (int i = 0; i < num_input; i++) { - free(data_name[i]); + UNI_FREE(data_name[i]); } - free(data_name); - free(data_dt); - free(data_df); + UNI_FREE(data_name); + UNI_FREE(data_dt); + UNI_FREE(data_df); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -355,16 +357,16 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_resizeModelInput)(JN ResizeModelInput( ih, num_input, (const char **)data_name, data_n, data_c, data_h, data_w, data_dt, data_df); - free(data_n); - free(data_c); - free(data_h); - free(data_w); + UNI_FREE(data_n); + UNI_FREE(data_c); + UNI_FREE(data_h); + UNI_FREE(data_w); for (int i = 0; i < num_input; i++) { - free(data_name[i]); + UNI_FREE(data_name[i]); } - free(data_name); - free(data_dt); - free(data_df); + UNI_FREE(data_name); + UNI_FREE(data_dt); + UNI_FREE(data_df); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -375,7 +377,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocAllResultHandl ModelHandle ih = (ModelHandle)modelAddr; ResultHandle ir = AllocAllResultHandle(ih); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)ir; + return (long long)ir; } extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResultHandle)( @@ -387,12 +389,12 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResult env->GetArrayLength(outputNames), num_outputs); } ModelHandle ih = (ModelHandle)modelAddr; - char **output_names_ptr = (char **)malloc(sizeof(char *) * num_outputs); + char **output_names_ptr = (char **)UNI_MALLOC(sizeof(char *) * num_outputs); for (int i = 0; i < num_outputs; i++) { jstring cur_str = (jstring)(env->GetObjectArrayElement(outputNames, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - output_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); 
+ output_names_ptr[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(output_names_ptr[i], cur_str_ptr, length); output_names_ptr[i][length] = '\0'; @@ -402,11 +404,11 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_allocSpecificResult ResultHandle ir = AllocSpecificResultHandle(ih, num_outputs, (const char **)output_names_ptr); for (int i = 0; i < num_outputs; i++) { - free(output_names_ptr[i]); + UNI_FREE(output_names_ptr[i]); } - free(output_names_ptr); + UNI_FREE(output_names_ptr); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)ir; + return (long long)ir; } extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_setRuntimeDeviceJNI)( @@ -462,13 +464,13 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *en CNN *cnn = (CNN *)ihInfo->cnn; std::map> inMap = cnn->get_input(); - char **input_names_ptr = (char **)malloc(sizeof(char *) * num_input); - void **mem_ptr = (void **)malloc(sizeof(void *) * num_input); + char **input_names_ptr = (char **)UNI_MALLOC(sizeof(char *) * num_input); + void **mem_ptr = (void **)UNI_MALLOC(sizeof(void *) * num_input); for (int i = 0; i < num_input; i++) { jstring cur_str = (jstring)(env->GetObjectArrayElement(input_names, i)); const char *cur_str_ptr = env->GetStringUTFChars(cur_str, 0); int length = strlen(cur_str_ptr); - input_names_ptr[i] = (char *)malloc(sizeof(char) * (length + 1)); + input_names_ptr[i] = (char *)UNI_MALLOC(sizeof(char) * (length + 1)); UNI_MEMCPY(input_names_ptr[i], cur_str_ptr, length); input_names_ptr[i][length] = '\0'; env->ReleaseStringUTFChars(cur_str, cur_str_ptr); @@ -488,10 +490,10 @@ extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_runModel)(JNIEnv *en RunModel(ih, ir, num_input, (const char **)input_names_ptr, mem_ptr); for (int i = 0; i < num_input; i++) { - free(input_names_ptr[i]); + UNI_FREE(input_names_ptr[i]); } - free(input_names_ptr); - free(mem_ptr); + UNI_FREE(input_names_ptr); + UNI_FREE(mem_ptr); UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); } @@ -583,7 +585,7 @@ extern "C" JNIEXPORT jlong JNICALL BOLT_JNI_PREFIX(BoltModel_cloneResultHandle)( UNI_DEBUG_LOG("JNI %s...\n", __FUNCTION__); ResultHandle ir = (ResultHandle)ResultHandleAddr; UNI_DEBUG_LOG("JNI %s end.\n", __FUNCTION__); - return (long)CloneResultHandle(ir); + return (long long)CloneResultHandle(ir); } extern "C" JNIEXPORT void JNICALL BOLT_JNI_PREFIX(BoltModel_freeResultHandle)( diff --git a/inference/engine/src/CMakeLists.txt b/inference/engine/src/CMakeLists.txt index aae57281..7f48bdc6 100644 --- a/inference/engine/src/CMakeLists.txt +++ b/inference/engine/src/CMakeLists.txt @@ -7,6 +7,9 @@ add_library(${PROJECT_NAME} SHARED ${srcs}) add_library(${PROJECT_NAME}_static STATIC ${srcs}) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC tensor image model_spec) +if (USE_SECURE_C) + target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY}) +endif () if (BUILD_TEST) target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${JPEG_SHARED_LIBRARY}) endif (BUILD_TEST) diff --git a/inference/engine/src/bolt.cpp b/inference/engine/src/bolt.cpp index f1f0700e..12218319 100644 --- a/inference/engine/src/bolt.cpp +++ b/inference/engine/src/bolt.cpp @@ -11,8 +11,8 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-#include "inference.hpp" #include "../api/c/bolt.h" +#include "inference.hpp" #define NAME_VALUE_PAIR(x) #x, x const int DataDescMaxDims = 8; @@ -39,14 +39,14 @@ typedef struct { DEVICE_TYPE deviceType; } ResultHandleInner; -inline DataType DATA_TYPE2DataType(DATA_TYPE dt_user) +inline static DataType DATA_TYPE2DataType(DATA_TYPE dt_user) { DataType ret = DT_F32; switch (dt_user) { case FP_32: ret = DT_F32; break; -#ifdef __aarch64__ +#ifdef _USE_FP16 case FP_16: ret = DT_F16; break; @@ -64,14 +64,14 @@ inline DataType DATA_TYPE2DataType(DATA_TYPE dt_user) return ret; } -inline DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) +inline static DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) { DATA_TYPE ret = FP_32; switch (dt_bolt) { case DT_F32: ret = FP_32; break; -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: ret = FP_16; break; @@ -89,7 +89,7 @@ inline DATA_TYPE DataType2DATA_TYPE(DataType dt_bolt) return ret; } -inline DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) +inline static DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) { DataFormat ret = DF_NCHW; switch (df_user) { @@ -116,7 +116,7 @@ inline DataFormat DATA_FORMAT2DataFormat(DATA_FORMAT df_user) return ret; } -inline DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) +inline static DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) { DATA_FORMAT ret = NCHW; switch (df_bolt) { @@ -146,10 +146,13 @@ inline DATA_FORMAT DataFormat2DATA_FORMAT(DataFormat df_bolt) return ret; } -inline AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) +inline static AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) { AffinityPolicy ret = AFFINITY_CPU_HIGH_PERFORMANCE; switch (affinity) { + case CPU: + ret = AFFINITY_CPU; + break; case CPU_HIGH_PERFORMANCE: ret = AFFINITY_CPU_HIGH_PERFORMANCE; break; @@ -167,7 +170,7 @@ inline AffinityPolicy AFFINITY_TYPE2AffinityPolicy(AFFINITY_TYPE affinity) return ret; } -inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) +inline static Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) { Arch ret = ARM_V8; switch (device) { @@ -192,6 +195,9 @@ inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) case CPU_X86_AVX2: ret = X86_AVX2; break; + case CPU_X86_AVX512: + ret = X86_AVX512; + break; case CPU_SERIAL: ret = CPU_GENERAL; break; @@ -203,7 +209,7 @@ inline Arch DEVICE_TYPE2Arch(DEVICE_TYPE device) return ret; } -inline DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) +inline static DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) { DEVICE_TYPE ret = CPU_ARM_V8; switch (arch) { @@ -242,7 +248,7 @@ inline DEVICE_TYPE Arch2DEVICE_TYPE(Arch arch) return ret; } -void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) +inline static void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) { dstDesc->dt = srcDesc.dt; dstDesc->df = srcDesc.df; @@ -258,102 +264,155 @@ void TensorDesc2DataDesc(TensorDesc srcDesc, DataDesc *dstDesc) } } -void assert_not_nullptr(const char *funcName, const char *ptrName, const void *ptr) +inline static void assert_not_nullptr(const char *funcName, const char *ptrName, const void *ptr) { if (ptr == NULL) { - UNI_ERROR_LOG("C API %s received null ptr %s.\n", funcName, ptrName); + UNI_WARNING_LOG("C API %s received null ptr %s.\n", funcName, ptrName); } } +static void print_model_handle(ModelHandleInner *handle) +{ + if (handle == nullptr) { + UNI_DEBUG_LOG("ModelHandle %p\n", handle); + } else { + UNI_DEBUG_LOG("ModelHandle %p(modelspec:%p engine:%p device:%d algorithm:%s file " + "stream:%d)\n", + handle, handle->ms, handle->cnn, handle->deviceType, 
(const char *)handle->algoPath, + handle->useFileStream); + } +} + +static void print_result_handle(ResultHandleInner *handle) +{ + if (handle == nullptr) { + UNI_DEBUG_LOG("ResultHandle %p\n", handle); + } else { + UNI_DEBUG_LOG("ResultHandle %p(num:%u data:%p device:%d)\n", handle, handle->num_outputs, + handle->outputArr, handle->deviceType); + } +} + +const char *const *GetDataTypeString() +{ + static const char *const names[] = {"FP_32", "FP_16", "INT_32", "UINT_32"}; + return names; +} + +const char *const *GetDataFormatString() +{ + static const char *const names[] = {"NCHW", "NHWC", "NCHWC8", "MTK", "NORMAL"}; + return names; +} + void GetGpuDeviceName(char *gpuDeviceName) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, gpuDeviceName); std::string deviceName = "unKnown"; #ifdef _USE_GPU deviceName = OCLContext::getInstance().handle->deviceName; #endif - strcpy(gpuDeviceName, deviceName.c_str()); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_STRCPY(gpuDeviceName, deviceName.c_str()); + UNI_DEBUG_LOG("C API %s(%s) end.\n", __FUNCTION__, gpuDeviceName); } ModelHandle CreateModel(const char *modelPath, AFFINITY_TYPE affinity, const char *algorithmMapPath) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, modelPath, affinity, algorithmMapPath); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(modelPath)); - ModelHandleInner *handle = new ModelHandleInner(); - ModelSpec *ms = new ModelSpec(); - if (SUCCESS != deserialize_model_from_file(modelPath, ms)) { - UNI_ERROR_LOG("C API %s failed to load model %s.\n", __FUNCTION__, modelPath); - delete ms; - handle->cnn = nullptr; - return (ModelHandle)handle; - } - CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); - cnn->sort_operators_sequential(ms); - cnn->initialize_ops(ms); - - handle->cnn = (void *)cnn; - handle->ms = (void *)ms; - handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); - handle->algoPath = (void *)algorithmMapPath; - handle->useFileStream = false; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + ModelHandleInner *handle = nullptr; + if (modelPath != nullptr) { + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelPath, ms)) { + UNI_WARNING_LOG("C API %s failed to load model %s.\n", __FUNCTION__, modelPath); + delete ms; + } else { + CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle = new ModelHandleInner(); + handle->cnn = (void *)cnn; + handle->ms = (void *)ms; + handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); + handle->algoPath = (void *)algorithmMapPath; + handle->useFileStream = false; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, handle); + print_model_handle(handle); return (ModelHandle)handle; } ModelHandle CloneModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *handle = (ModelHandleInner *)ih; + print_model_handle(handle); assert_not_nullptr(__FUNCTION__, "ModelHandle", handle); - CNN *cnn = (CNN *)handle->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - ModelHandleInner *cloneHandle = new ModelHandleInner(); - *cloneHandle = *handle; - CNN *cloneCnn = new CNN(); - *cloneCnn = cnn->clone(); - cloneHandle->cnn = cloneCnn; - UNI_DEBUG_LOG("C API %s end.\n", 
__FUNCTION__); + ModelHandleInner *cloneHandle = nullptr; + if (handle != nullptr) { + CNN *cnn = (CNN *)handle->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + cloneHandle = new ModelHandleInner(); + *cloneHandle = *handle; + CNN *cloneCnn = new CNN(); + *cloneCnn = cnn->clone(); + cloneHandle->cnn = cloneCnn; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, cloneHandle); + print_model_handle(cloneHandle); return (ModelHandle)cloneHandle; } ModelHandle CreateModelWithFileStream( const char *modelFileStream, AFFINITY_TYPE affinity, const char *algorithmMapFileStream) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, modelFileStream, affinity, + algorithmMapFileStream); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(modelFileStream)); - ModelHandleInner *handle = new ModelHandleInner(); - ModelSpec *ms = new ModelSpec(); - if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) { - UNI_ERROR_LOG("C API %s failed to parse model.\n", __FUNCTION__); - delete ms; - handle->cnn = nullptr; - return (ModelHandle)handle; - } - CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); - cnn->sort_operators_sequential(ms); - cnn->initialize_ops(ms); - - handle->cnn = (void *)cnn; - handle->ms = (void *)ms; - handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); - handle->algoPath = (void *)algorithmMapFileStream; - handle->useFileStream = true; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + ModelHandleInner *handle = nullptr; + if (modelFileStream != nullptr) { + ModelSpec *ms = new ModelSpec(); + if (SUCCESS != deserialize_model_from_file(modelFileStream, ms, true)) { + UNI_WARNING_LOG("C API %s failed to parse model.\n", __FUNCTION__); + delete ms; + } else { + CNN *cnn = new CNN(AFFINITY_TYPE2AffinityPolicy(affinity), ms->dt, ms->model_name); + cnn->sort_operators_sequential(ms); + cnn->initialize_ops(ms); + + handle = new ModelHandleInner(); + handle->cnn = (void *)cnn; + handle->ms = (void *)ms; + handle->deviceType = Arch2DEVICE_TYPE(cnn->get_runtime_device()); + handle->algoPath = (void *)algorithmMapFileStream; + handle->useFileStream = true; + } + } + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, handle); + print_model_handle(handle); return (ModelHandle)handle; } int GetNumInputsFromModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - - return (cnn->get_input_desc()).size(); + print_model_handle(ihInfo); + int ret = 0; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + ret = (cnn->get_input_desc()).size(); + } + } + UNI_DEBUG_LOG("C API %s(%d) end.\n", __FUNCTION__, ret); + return ret; } void GetInputDataInfoFromModel(ModelHandle ih, @@ -382,9 +441,11 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, DATA_TYPE *dt, DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, handle, + num_inputs, name, n, c, t, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)handle; 
assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); @@ -411,7 +472,7 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, U32 in, ic, it, ih, iw; int i = 0; for (auto iter : inputTensorDescs) { - strcpy(name[i], iter.first.c_str()); + UNI_STRCPY(name[i], iter.first.c_str()); TensorDesc desc = iter.second; in = ic = it = ih = iw = 1; if (tensorIs1d(desc)) { @@ -440,7 +501,7 @@ void GetInputDataInfoFromModel5D(ModelHandle handle, UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } -std::map getInputDataFormatFromUser(ModelHandle ih, +static std::map getInputDataFormatFromUser(ModelHandle ih, int num_inputs, const char **name, const int *n, @@ -481,7 +542,7 @@ std::map getInputDataFormatFromUser(ModelHandle ih, if (inputTensorDescs.find(inputName) == inputTensorDescs.end()) { UNI_ERROR_LOG( "C API inner function %s received %s is not model input.\n", __FUNCTION__, name[i]); - exit(1); + continue; } DataType idt = DATA_TYPE2DataType(dt[i]); DataFormat idf = DATA_FORMAT2DataFormat(df[i]); @@ -525,7 +586,7 @@ void PrepareModel(ModelHandle ih, } void PrepareModel5D(ModelHandle ih, - int num_input, + int num_inputs, const char **name, const int *n, const int *c, @@ -535,22 +596,25 @@ void PrepareModel5D(ModelHandle ih, const DATA_TYPE *dt, const DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ih, + num_inputs, name, n, c, t, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); std::map modelInputDims = - getInputDataFormatFromUser(ih, num_input, name, n, c, t, h, w, dt, df); - cnn->loadAlgorithmMap((const char *)ihInfo->algoPath, ihInfo->useFileStream); + getInputDataFormatFromUser(ih, num_inputs, name, n, c, t, h, w, dt, df); + //cnn->loadAlgorithmMap((const char *)ihInfo->algoPath, ihInfo->useFileStream); cnn->ready(modelInputDims); cnn->mark_input_output(); ModelSpec *ms = (ModelSpec *)ihInfo->ms; CHECK_STATUS(mt_destroy_model(ms)); delete ms; + ihInfo->ms = nullptr; UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } @@ -564,9 +628,12 @@ void ResizeModelInput(ModelHandle ih, const DATA_TYPE *dt, const DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ih, num_inputs, + name, n, c, h, w, dt, df); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); + CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); @@ -578,67 +645,80 @@ void ResizeModelInput(ModelHandle ih, ResultHandle AllocAllResultHandle(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - std::map outputTensorDescs = cnn->get_output_desc(); - int num_outputs = outputTensorDescs.size(); - DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) 
* num_outputs); - int i = 0; - for (auto iter : outputTensorDescs) { - std::string name = iter.first; - U32 length = name.size(); - length = (length > NAME_LEN) ? NAME_LEN : length; - memcpy(outputArrPtr[i].name, name.c_str(), length); - if (length < NAME_LEN) { - outputArrPtr[i].name[length] = '\0'; + print_model_handle(ihInfo); + ResultHandleInner *model_result_ptr = nullptr; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + model_result_ptr = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + std::map outputTensorDescs = cnn->get_output_desc(); + int num_outputs = outputTensorDescs.size(); + DataDesc *outputArrPtr = (DataDesc *)UNI_MALLOC(sizeof(DataDesc) * num_outputs); + int i = 0; + for (auto iter : outputTensorDescs) { + std::string name = iter.first; + U32 length = name.size(); + length = (length > NAME_LEN) ? NAME_LEN : length; + UNI_MEMCPY(outputArrPtr[i].name, name.c_str(), length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc2DataDesc(iter.second, &outputArrPtr[i]); + i++; + } + model_result_ptr->num_outputs = num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = ihInfo->deviceType; } - TensorDesc2DataDesc(iter.second, &outputArrPtr[i]); - i++; } - model_result_ptr->num_outputs = num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = ihInfo->deviceType; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - return (void *)model_result_ptr; + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, model_result_ptr); + print_result_handle(model_result_ptr); + return (ResultHandle)model_result_ptr; } ResultHandle AllocSpecificResultHandle(ModelHandle ih, int num_outputs, const char **name) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, ih, num_outputs, name); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - ResultHandleInner *model_result_ptr = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - int model_num_outputs = num_outputs; - DataDesc *outputArrPtr = (DataDesc *)malloc(sizeof(DataDesc) * model_num_outputs); - for (int i = 0; i < num_outputs; i++) { - U32 length = UNI_MIN(strlen(name[i]), NAME_LEN - 1); - memcpy(outputArrPtr[i].name, name[i], length); - if (length < NAME_LEN) { - outputArrPtr[i].name[length] = '\0'; + print_model_handle(ihInfo); + ResultHandleInner *model_result_ptr = nullptr; + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + model_result_ptr = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + int model_num_outputs = num_outputs; + DataDesc *outputArrPtr = (DataDesc *)UNI_MALLOC(sizeof(DataDesc) * model_num_outputs); + for (int i = 0; i < num_outputs; i++) { + U32 length = UNI_MIN(strlen(name[i]), NAME_LEN - 1); + UNI_MEMCPY(outputArrPtr[i].name, name[i], length); + if (length < NAME_LEN) { + outputArrPtr[i].name[length] = '\0'; + } + TensorDesc srcDesc = cnn->get_tensor_desc_by_name(name[i]); + TensorDesc2DataDesc(srcDesc, &outputArrPtr[i]); + } + model_result_ptr->num_outputs = model_num_outputs; + model_result_ptr->outputArr = outputArrPtr; + model_result_ptr->deviceType = ihInfo->deviceType; } - TensorDesc srcDesc = 
cnn->get_tensor_desc_by_name(name[i]); - TensorDesc2DataDesc(srcDesc, &outputArrPtr[i]); } - model_result_ptr->num_outputs = model_num_outputs; - model_result_ptr->outputArr = outputArrPtr; - model_result_ptr->deviceType = ihInfo->deviceType; - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, model_result_ptr); + print_result_handle(model_result_ptr); return (void *)model_result_ptr; } void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %d)...\n", __FUNCTION__, ih, cpu_id, device); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); cnn->set_runtime_device(cpu_id, DEVICE_TYPE2Arch(device)); @@ -648,9 +728,10 @@ void SetRuntimeDevice(ModelHandle ih, int cpu_id, DEVICE_TYPE device) void SetRuntimeDeviceDynamic(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); cnn->set_runtime_device_dynamic(); @@ -660,20 +741,22 @@ void SetRuntimeDeviceDynamic(ModelHandle ih) void SetNumThreads(int threadNum) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%d)...\n", __FUNCTION__, threadNum); set_cpu_num_threads(threadNum); UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } void RunModel(ModelHandle ih, ResultHandle ir, int num_inputs, const char **name, void **data) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %p, %d, %p, %p)...\n", __FUNCTION__, ih, ir, num_inputs, name, data); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); + print_model_handle(ihInfo); CNN *cnn = (CNN *)ihInfo->cnn; assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_inputs > 0) { assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(name)); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(data)); @@ -720,11 +803,16 @@ void RunModel(ModelHandle ih, ResultHandle ir, int num_inputs, const char **name int GetNumOutputsFromResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); - return ir_inner->num_outputs; + print_result_handle(ir_inner); + int ret = 0; + if (ir_inner != nullptr) { + ret = ir_inner->num_outputs; + } + UNI_DEBUG_LOG("C API %s(%d) end.\n", __FUNCTION__, ret); + return ret; } void GetOutputDataInfoFromResultHandle(ResultHandle ir, @@ -737,13 +825,15 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, DATA_TYPE *dt, DATA_FORMAT *df) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p, %p, %p, %p, %p, %p, %p)...\n", __FUNCTION__, ir, + num_outputs, name, n, c, h, w, dt, df); if (num_outputs <= 0) { UNI_WARNING_LOG("C API %s received num_outputs = 0.\n", 
__FUNCTION__); return; } ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_outputs != (int)ir_inner->num_outputs) { UNI_ERROR_LOG("C API %s received num_outputs %d != num_outputs %d in ResultHandle.\n", __FUNCTION__, num_outputs, ir_inner->num_outputs); @@ -759,7 +849,7 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(dt)); assert_not_nullptr(__FUNCTION__, NAME_VALUE_PAIR(df)); for (int i = 0; i < num_outputs; i++) { - strcpy(name[i], outputArrPtr[i].name); + UNI_STRCPY(name[i], outputArrPtr[i].name); dt[i] = DataType2DATA_TYPE(outputArrPtr[i].dt); df[i] = DataFormat2DATA_FORMAT(outputArrPtr[i].df); n[i] = outputArrPtr[i].dims[0]; @@ -772,13 +862,14 @@ void GetOutputDataInfoFromResultHandle(ResultHandle ir, void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p, %d, %p)...\n", __FUNCTION__, ir, num_outputs, data); if (num_outputs <= 0) { UNI_WARNING_LOG("C API %s received num_outputs = 0.\n", __FUNCTION__); return; } ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); + print_result_handle(ir_inner); if (num_outputs != (int)ir_inner->num_outputs) { UNI_ERROR_LOG("C API %s received num_outputs %d != num_outputs %d in ResultHandle.\n", __FUNCTION__, num_outputs, ir_inner->num_outputs); @@ -795,58 +886,80 @@ void GetOutputDataFromResultHandle(ResultHandle ir, int num_outputs, void **data ResultHandle CloneResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - ResultHandleInner *clone_ir_inner = (ResultHandleInner *)malloc(sizeof(ResultHandleInner)); - *clone_ir_inner = *ir_inner; - U32 size = sizeof(DataDesc) * clone_ir_inner->num_outputs; - if (size > 0) { - clone_ir_inner->outputArr = (DataDesc *)malloc(size); - DataDesc *outputArrPtr = ir_inner->outputArr; - assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); - memcpy(clone_ir_inner->outputArr, outputArrPtr, size); - } else { - clone_ir_inner->outputArr = nullptr; + print_result_handle(ir_inner); + ResultHandleInner *clone_ir_inner = nullptr; + if (ir_inner != nullptr) { + clone_ir_inner = (ResultHandleInner *)UNI_MALLOC(sizeof(ResultHandleInner)); + *clone_ir_inner = *ir_inner; + U32 size = sizeof(DataDesc) * clone_ir_inner->num_outputs; + if (size > 0) { + clone_ir_inner->outputArr = (DataDesc *)UNI_MALLOC(size); + DataDesc *outputArrPtr = ir_inner->outputArr; + assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); + UNI_MEMCPY(clone_ir_inner->outputArr, outputArrPtr, size); + } else { + clone_ir_inner->outputArr = nullptr; + } } - UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p) end.\n", __FUNCTION__, clone_ir_inner); + print_result_handle(clone_ir_inner); return (ResultHandle)clone_ir_inner; } void FreeResultHandle(ResultHandle ir) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ir); ResultHandleInner *ir_inner = (ResultHandleInner *)ir; assert_not_nullptr(__FUNCTION__, "ResultHandle", ir_inner); - DataDesc *outputArrPtr = ir_inner->outputArr; - if (ir_inner->num_outputs > 0) { - 
assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); - free(outputArrPtr); - ir_inner->num_outputs = 0; + print_result_handle(ir_inner); + if (ir_inner != nullptr) { + DataDesc *outputArrPtr = ir_inner->outputArr; + if (ir_inner->num_outputs > 0) { + assert_not_nullptr(__FUNCTION__, "ResultHandle.outputArr", outputArrPtr); + ir_inner->num_outputs = 0; + } + if (outputArrPtr != nullptr) { + UNI_FREE(outputArrPtr); + ir_inner->outputArr = nullptr; + } + UNI_FREE(ir_inner); } - (*ir_inner).outputArr = nullptr; - free(ir_inner); UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } void DestroyModel(ModelHandle ih) { - UNI_DEBUG_LOG("C API %s...\n", __FUNCTION__); + UNI_DEBUG_LOG("C API %s(%p)...\n", __FUNCTION__, ih); ModelHandleInner *ihInfo = (ModelHandleInner *)ih; assert_not_nullptr(__FUNCTION__, "ModelHandle", ihInfo); - - CNN *cnn = (CNN *)ihInfo->cnn; - assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); - - if (nullptr != ihInfo->algoPath && !ihInfo->useFileStream) { - const char *algoPath = (const char *)ihInfo->algoPath; - UNI_THREAD_SAFE(cnn->saveAlgorithmMapToFile(algoPath)); + print_model_handle(ihInfo); + if (ihInfo != nullptr) { + CNN *cnn = (CNN *)ihInfo->cnn; + assert_not_nullptr(__FUNCTION__, "ModelHandle.cnn", cnn); + if (cnn != nullptr) { + //if (ihInfo->algoPath != nullptr && !ihInfo->useFileStream) { + // const char *algoPath = (const char *)ihInfo->algoPath; + // UNI_THREAD_SAFE(cnn->saveAlgorithmMapToFile(algoPath)); + //} + delete cnn; + ihInfo->cnn = nullptr; + } + delete ihInfo; } - - delete cnn; - ihInfo->cnn = nullptr; - delete ihInfo; UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); } +void MemoryCheck() +{ + UNI_DEBUG_LOG("C API %s()...\n", __FUNCTION__); +#ifndef _USE_MEM_CHECK + UNI_WARNING_LOG("please set USE_MEM_CHECK to ON at common/cmakes/bolt.cmake, and rebuild " + "library.\n"); +#endif + UNI_MEM_STATISTICS(); + UNI_DEBUG_LOG("C API %s end.\n", __FUNCTION__); +} #undef NAME_VALUE_PAIR diff --git a/inference/engine/src/bolt_dllite.cpp b/inference/engine/src/bolt_dllite.cpp index ae2eb36e..140637df 100644 --- a/inference/engine/src/bolt_dllite.cpp +++ b/inference/engine/src/bolt_dllite.cpp @@ -376,7 +376,7 @@ bolt::ResultHandle bolt::AllocResult( for (size_t i = 0; i < outputs.size(); i++) { U32 length = outputs[i].name.length(); outputNames[i] = (char *)malloc(length + 1); - memcpy(outputNames[i], outputs[i].name.c_str(), length); + UNI_MEMCPY(outputNames[i], outputs[i].name.c_str(), length); outputNames[i][length] = '\0'; } bolt::ResultHandle rh = (bolt::ResultHandle)AllocSpecificResultHandle( diff --git a/inference/engine/src/cnn.cpp b/inference/engine/src/cnn.cpp index 30d8e50b..56e8ec36 100644 --- a/inference/engine/src/cnn.cpp +++ b/inference/engine/src/cnn.cpp @@ -11,6 +11,7 @@ // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +#include #include "cnn.h" #ifdef _USE_CPU #include "cpu/factory_cpu.hpp" @@ -33,6 +34,22 @@ bool is_same_tensor(Tensor a, Tensor b) return ret; } +void CNN::check_dynamic_output_size(OperatorType type) +{ + std::set types = {OT_Shape, OT_NonMaxSuppression}; + if (types.find(type) != types.end()) { + this->dynamicOutputSize = true; + } + if (type == OT_Shape) { + UNI_WARNING_LOG("model contains Shape operator, this will use dynamic output size " + "inference(may encounter error). 
If you don't want to use it, you can use " + "onnx-simplifier to simplify original onnx model.\n"); + if (IS_GPU(this->deviceInfo.schedule)) { + UNI_ERROR_LOG("gpu currently not support dynamic output size inference.\n"); + } + } +} + CNN CNN::clone() { CNN cnn = *this; @@ -166,8 +183,7 @@ void CNN::initialize_ops(const ModelSpec *ms) std::shared_ptr factory; if (IS_GPU(this->deviceInfo.schedule)) { #ifdef _USE_GPU - auto factory_ocl = (Factory *)(new FactoryOCL()); - factory = std::shared_ptr(factory_ocl); + factory = std::shared_ptr(new FactoryOCL()); this->tmpTensor = Tensor(OCLMem); #else UNI_ERROR_LOG("This library not support ARM GPU, please rebuild library with --gpu " @@ -175,17 +191,18 @@ void CNN::initialize_ops(const ModelSpec *ms) exit(1); #endif } else { - auto factory_cpu = (Factory *)(new FactoryCPU()); - factory = std::shared_ptr(factory_cpu); + factory = std::shared_ptr(new FactoryCPU()); this->tmpTensor = Tensor(); } for (int i = 0; i < opNum; i++) { OperatorSpec curOps = ms->ops[i]; std::string opName = curOps.name; + UNI_DEBUG_LOG("create operator:%s type:%s.\n", curOps.name, OperatorTypeName()[curOps.type]); if (opName.compare("data") == 0) { continue; } + this->check_dynamic_output_size(curOps.type); std::vector inputTensorsName; std::vector outputTensorsName; for (U32 j = 0; j < curOps.num_inputs; j++) { @@ -212,6 +229,7 @@ void CNN::initialize_ops(const ModelSpec *ms) for (int i = 0; i < ms->num_weight_specs; i++) { WeightSpec curOpWs = ms->ws[i]; std::string opName = curOpWs.op_name; + UNI_DEBUG_LOG("set operator:%s's weight parameter.\n", curOpWs.op_name); if (this->operatorMap.find(opName) == this->operatorMap.end()) { UNI_WARNING_LOG("unsed weight %s in model.\n", opName.c_str()); continue; @@ -238,7 +256,7 @@ void CNN::ready(std::map inputDescMap) if (op->is_weight()) { UNI_DEBUG_LOG("op: %s init weight\n", op->get_name().c_str()); auto weightOpPtr = dynamic_cast(op.get()); - CHECK_STATUS(weightOpPtr->init_weight_bias_from_model(nullptr)); + CHECK_STATUS(weightOpPtr->init_weight_bias_from_model()); } UNI_DEBUG_LOG("op: %s infer forward algorithm\n", op->get_name().c_str()); //need process for qualcomm @@ -277,7 +295,6 @@ void CNN::reready(std::map inputDescMap) EE CNN::mark_input_output() { - EE ret = SUCCESS; for (auto &iter : this->inputTensors) { std::string str = iter.first; if (tensorMap.find(str) != tensorMap.end()) { @@ -285,8 +302,7 @@ EE CNN::mark_input_output() } else { UNI_ERROR_LOG( "can not find tensor(name: %s) to be marked as model input.\n", str.c_str()); - ret = NOT_MATCH; - break; + return NOT_MATCH; } } for (auto &iter : this->outputTensors) { @@ -297,11 +313,10 @@ EE CNN::mark_input_output() UNI_ERROR_LOG("can not find tensor(name: %s) to be marked as model output. 
Maybe this " "tensor is removed by graph optimizer.\n", str.c_str()); - ret = NOT_MATCH; - break; + return NOT_MATCH; } } - return ret; + return SUCCESS; } void CNN::set_input_by_copy(std::map modelTensorsInput) @@ -312,14 +327,16 @@ void CNN::set_input_by_copy(std::map modelTensorsInput) UNI_DEBUG_LOG(" Copy input %s...\n", inputName.c_str()); U8 *data = modelTensorInput.second; if (this->inputTensors.find(inputName) == this->inputTensors.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find input:%s to set.\n", inputName.c_str()); + return; } auto tensorPtr = this->inputTensors[inputName]; Tensor input; input.resize(tensorPtr->get_desc()); std::shared_ptr shared_data(data, [](U8 *ptr) {}); ((CpuMemory *)(input.get_memory()))->set_shared_ptr(shared_data); - tensorPtr->copy_from(&input); + UNI_PROFILE( + { tensorPtr->copy_from(&input); }, "copy " + inputName, std::string("input::copy")); UNI_DEBUG_LOG(" Copy input: %s %s\n", inputName.c_str(), tensorPtr->string(8).c_str()); } UNI_DEBUG_LOG("Copy input end.\n"); @@ -332,15 +349,20 @@ void CNN::set_input_by_assign(std::map> modelTe std::string inputName = modelTensorInput.first; std::shared_ptr data = modelTensorInput.second; if (this->inputTensors.find(inputName) == this->inputTensors.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find input:%s to set.\n", inputName.c_str()); + return; } auto tensorPtr = this->inputTensors[inputName]; - if (data != ((CpuMemory *)(tensorPtr->get_memory()))->get_shared_ptr()) { - Tensor input; - input.resize(tensorPtr->get_desc()); - ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data); - tensorPtr->reuse(&input); - } + UNI_PROFILE( + { + if (data != ((CpuMemory *)(tensorPtr->get_memory()))->get_shared_ptr()) { + Tensor input; + input.resize(tensorPtr->get_desc()); + ((CpuMemory *)(input.get_memory()))->set_shared_ptr(data); + tensorPtr->reuse(&input); + } + }, + "copy " + inputName, std::string("input::copy")); UNI_DEBUG_LOG(" Set input: %s %s\n", inputName.c_str(), tensorPtr->string(8).c_str()); } UNI_DEBUG_LOG("Set input end.\n"); @@ -371,10 +393,13 @@ std::map> CNN::get_output() Tensor CNN::get_tensor_by_name(std::string tensorName) { + Tensor ret; if (this->tensorMap.find(tensorName) == this->tensorMap.end()) { - CHECK_STATUS(NOT_MATCH); + UNI_ERROR_LOG("Can not find output:%s to get.\n", tensorName.c_str()); + } else { + ret = *(this->tensorMap[tensorName].get()); } - return *(this->tensorMap[tensorName].get()); + return ret; } TensorDesc CNN::get_tensor_desc_by_name(std::string tensorName) @@ -404,6 +429,52 @@ std::map CNN::get_output_desc() return descs; } +void CNN::update_tensor_positions() +{ + std::unordered_map m; + for (auto &opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + if (op->get_type() == OT_Reshape) { + std::vector curOpInputTensorName = this->operatorTensorMap[opName][0]; + std::vector curOpOutputTensorName = this->operatorTensorMap[opName][1]; + auto tensor = this->tensorMap[curOpInputTensorName[0]]; + if ((tensor->get_desc().df != DF_NCHWC8) && + (tensor->get_desc().df != DF_NCHWC16)) + { + std::vector tensorPositions = op->get_tensor_positions(); + m[curOpInputTensorName[0]] = m[curOpOutputTensorName[0]] = tensorPositions[0] = -1; + tensorPositions[1] = -3; + // when slot is -3, reuse the input tensor mem. 
+ op->set_tensor_positions(tensorPositions); + } + } + } + if (!m.empty()) { + for (auto &opName : this->sortedOps) { + auto op = this->operatorMap[opName]; + std::vector tensorPositions = op->get_tensor_positions(); + if (tensorPositions.size() > 1 && tensorPositions[1] == -3) { + continue; + } + bool update = false; + for (U32 i = 0, tensorIter = 0; i < 2; ++i) { + U32 iterSize = this->operatorTensorMap[opName][i].size(); + for (U32 j = 0; j < iterSize; ++j) { + std::string tensorName = this->operatorTensorMap[opName][i][j]; + if (m.count(tensorName)) { + tensorPositions[tensorIter] = m[tensorName]; + update = true; + } + ++tensorIter; + } + } + if (update) { + op->set_tensor_positions(tensorPositions); + } + } + } +} + EE CNN::infer_output_tensors_size(std::map inputDescMap) { UNI_DEBUG_LOG("Infer tensor dimension...\n"); @@ -413,6 +484,9 @@ EE CNN::infer_output_tensors_size(std::map inputDescMap "model input: %s desc %s\n", iter.first.c_str(), tensorDesc2Str(iter.second).c_str()); } this->infer_layout_desc(); +#ifndef _USE_GPU + this->update_tensor_positions(); +#endif this->update_op_tensors(); UNI_DEBUG_LOG("Infer tensor dimension end.\n"); return SUCCESS; @@ -452,7 +526,7 @@ void CNN::assign_output_tensor() // tensorPositions[tensorIter]); auto tensor = this->tensorMap[tensorName]; bool needAssign = true; - if (i == 0 && this->inputTensors.find(tensorName) == this->inputTensors.end()) { + if (i == 0 && (this->inputTensors.find(tensorName) == this->inputTensors.end())) { needAssign = false; } if (this->weightOpOutputNames.find(tensorName) != this->weightOpOutputNames.end()) { @@ -462,6 +536,7 @@ void CNN::assign_output_tensor() I32 slot = tensorPositions[tensorIter]; if (slot >= 0) { tensor->reuse(get_reuse_memory(slot, tensor.get())); + } else if (slot == -1) { tensor->alloc(); #ifdef _USE_GPU @@ -469,6 +544,9 @@ void CNN::assign_output_tensor() auto mem = (OclMemory *)tensor->get_memory(); mem->mapped_alloc(); #endif + } else if (slot == -3) { + // when slot is -3, reuse the input tensor mem. 
+ tensor->reuse(&(tensors[0][0])); } } tensorIter++; @@ -489,7 +567,37 @@ void CNN::run() if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { opIndex = op->get_next_operator_index(); } else { - UNI_PROFILE(op->run(), op->get_name(), + if (this->dynamicOutputSize) { + std::vector inputs = op->get_input_tensors(); + std::vector outputs = op->get_output_tensors(); + std::vector in, out; + for (U32 i = 0; i < inputs.size(); i++) { + in.push_back(&inputs[i]); + } + for (U32 i = 0; i < outputs.size(); i++) { + out.push_back(&outputs[i]); + } + op->infer_output_tensors_size(in, out); + } +#ifdef _DEBUG + std::vector inputTensors = op->get_input_tensors(); + std::vector inputNames = operatorTensorMap[op->get_name()][0]; + for (U32 i = 0; i < inputTensors.size(); i++) { + Tensor inputTensor = inputTensors[i]; + std::string line = inputTensor.string(8); + UNI_DEBUG_LOG(" input:%s %s\n", inputNames[i].c_str(), line.c_str()); + } +#endif + UNI_PROFILE( + { + op->run(); +#if defined(_USE_GPU) && defined(_PROFILE) + if (IS_GPU(this->deviceInfo.schedule)) { + gcl_finish(OCLContext::getInstance().handle.get()); + } +#endif + }, + op->get_name(), std::string(OperatorTypeName()[op->get_type()]) + std::string("::run")); opIndex++; } @@ -562,7 +670,7 @@ void CNN::set_op_tensors_positions(std::shared_ptr op, U32 outputTensorsNum = outputTensorsName.size(); U32 numTensors = inputTensorsNum + outputTensorsNum; std::vector tensorPositions(numTensors); - memcpy(tensorPositions.data(), tensor_positions, numTensors * bytesOf(DT_I32)); + UNI_MEMCPY(tensorPositions.data(), tensor_positions, numTensors * bytesOf(DT_I32)); if (IS_GPU(this->deviceInfo.schedule)) { for (U32 j = 0; j < numTensors; j++) { std::string curTensorName; @@ -636,7 +744,7 @@ void CNN::set_input_desc(std::map inputDescMap) { for (auto &iter : inputDescMap) { if (tensorMap.find(iter.first) == tensorMap.end()) { - UNI_WARNING_LOG("Unused model input node: %s\n", iter.first.c_str()); + UNI_WARNING_LOG("unused model input node: %s\n", iter.first.c_str()); continue; } TensorDesc desc = iter.second; diff --git a/inference/engine/src/data_loader.cpp b/inference/engine/src/data_loader.cpp index a7860250..c5769ae2 100644 --- a/inference/engine/src/data_loader.cpp +++ b/inference/engine/src/data_loader.cpp @@ -18,6 +18,7 @@ #include "data_loader.hpp" #include #include +#include #ifdef _BUILD_TEST #include @@ -50,7 +51,7 @@ std::vector load_jpeg( info.out_color_space); CHECK_REQUIREMENT(2 == info.out_color_space); // Support RGB for now - U8 *data = (U8 *)malloc(dataSize); + U8 *data = (U8 *)UNI_MALLOC(dataSize); JSAMPROW row_pointer[1]; while (info.output_scanline < info.output_height) { row_pointer[0] = data + info.output_scanline * width * numChannels; @@ -76,7 +77,7 @@ std::vector load_jpeg( b[i] = dataMov[2]; dataMov += numChannels; } - free(data); + UNI_FREE(data); std::shared_ptr imageTensor = load_resize_image(rgbTensor, imageDesc[0], ImageFormat, scaleValue); @@ -130,7 +131,8 @@ void get_files(std::string directoryName, std::vector &files) } struct dirent *file; while ((file = readdir(directory)) != NULL) { - if (strcmp(file->d_name, ".") == 0 || strcmp(file->d_name, "..") == 0) { + if (std::string(file->d_name) == std::string(".") || + std::string(file->d_name) == std::string("..")) { continue; } struct stat st; @@ -146,7 +148,7 @@ void get_files(std::string directoryName, std::vector &files) closedir(directory); } -Tensor fscanfReadData(FILE *f, TensorDesc desc) +Tensor readFileData(std::ifstream &file, TensorDesc desc) { Tensor tensor 
= Tensor::alloc_sized(desc); U32 size = tensor.length(); @@ -156,41 +158,37 @@ Tensor fscanfReadData(FILE *f, TensorDesc desc) case DT_F32: { F32 *dataPtr = (F32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", dataPtr + i); + file >> dataPtr[i]; } break; } -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: { F16 *dataPtr = (F16 *)ptr; F32 value; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); + file >> value; dataPtr[i] = (F16)value; } break; } #endif case DT_U32: { - F32 value = 0; U32 *dataPtr = (U32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); - dataPtr[i] = value; + file >> dataPtr[i]; } break; } case DT_I32: { - F32 value = 0; I32 *dataPtr = (I32 *)ptr; for (U32 i = 0; i < size; i++) { - fscanf(f, "%f", &value); - dataPtr[i] = value; + file >> dataPtr[i]; } break; } default: - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read %s type data.\n", DataTypeName()[dataType]); break; } return tensor; @@ -211,31 +209,37 @@ std::vector load_fake_data(std::vector dataDesc) std::vector load_txt(std::string dataPath, std::vector dataDesc) { std::vector result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); + std::ifstream file; + file.open(dataPath.c_str()); + if (!file.is_open()) { + UNI_ERROR_LOG("can not read %s.\n", dataPath.c_str()); + } for (U32 index = 0; index < dataDesc.size(); index++) { - result.push_back(fscanfReadData(f, dataDesc[index])); + result.push_back(readFileData(file, dataDesc[index])); } - fclose(f); + file.close(); return result; } std::vector load_seq(std::string dataPath, std::vector dataDesc) { std::vector result; - FILE *f = fopen(dataPath.c_str(), "r"); - CHECK_REQUIREMENT(f != nullptr); + std::ifstream file; + file.open(dataPath.c_str()); + if (!file.is_open()) { + UNI_ERROR_LOG("can not read %s.\n", dataPath.c_str()); + } for (U32 index = 0; index < dataDesc.size(); index++) { U32 sequenceLen = 0; - fscanf(f, "%u", &sequenceLen); + file >> sequenceLen; TensorDesc sequenceDesc = dataDesc[index]; sequenceDesc.dims[0] = sequenceLen; for (U32 j = 1; j < sequenceDesc.nDims; j++) { sequenceDesc.dims[j] = 1; } - result.push_back(fscanfReadData(f, sequenceDesc)); + result.push_back(readFileData(file, sequenceDesc)); } - fclose(f); + file.close(); return result; } @@ -253,21 +257,15 @@ std::vector load_bin( Tensor tensor = Tensor::alloc_sized(sourceDesc); U32 len = tensor.length(); auto ptr = ((CpuMemory *)(tensor.get_memory()))->get_ptr(); - U32 readLength = fread(ptr, bytesOf(sourceDataType[index]), len, f); - CHECK_REQUIREMENT(len == readLength); + CHECK_REQUIREMENT(fread(ptr, bytesOf(sourceDataType[index]), len, f) == len); if (sourceDataType[index] != dataDesc[index].dt) { Tensor transform_tensor = Tensor::alloc_sized(dataDesc[index]); - if (0) { -#ifdef __aarch64__ - } else if (sourceDataType[index] == DT_F32 && dataDesc[index].dt == DT_F16) { - F32 *ptr1 = (F32 *)ptr; - F16 *ptr2 = (F16 *)((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(); - for (U32 i = 0; i < len; i++) { - ptr2[i] = (F16)ptr1[i]; - } -#endif + if (sourceDataType[index] == DT_F32) { + transformFromFloat(dataDesc[index].dt, (const float *)ptr, + ((CpuMemory *)(transform_tensor.get_memory()))->get_ptr(), len); } else { - CHECK_STATUS(NOT_SUPPORTED); + UNI_ERROR_LOG("not support to read+transform %s data.\n", + DataTypeName()[sourceDataType[index]]); } result.push_back(transform_tensor); } else { @@ -315,3 +313,21 @@ std::vector load_data(std::string directoryPath, } return dataPaths; } + +bool 
is_directory(std::string path) +{ + bool ret = false; + struct stat s; + if (stat(path.c_str(), &s) == 0) { + if (s.st_mode & S_IFDIR) { + ret = true; + } else if (s.st_mode & S_IFREG) { + ret = false; + } else { + UNI_ERROR_LOG("can not recognize %s.\n", path.c_str()); + } + } else { + UNI_ERROR_LOG("%s is not exist.\n", path.c_str()); + } + return ret; +} diff --git a/inference/engine/src/model.cpp b/inference/engine/src/model.cpp new file mode 100644 index 00000000..5fa2c599 --- /dev/null +++ b/inference/engine/src/model.cpp @@ -0,0 +1,168 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +#include "model.hpp" +#include "thread_affinity.h" + +Model::Model(AffinityPolicy affinityPolicy, DataType dt, std::string name) +{ + this->set_device_info(affinityPolicy); + this->dt = dt; + this->name = name; + std::string deviceName = ""; + if (IS_GPU(this->deviceInfo.schedule)) { +#ifdef _USE_GPU + if (OCLContext::getInstance().handle->useQualcommDev) { + this->deviceInfo.schedule = QUALCOMM; + } +#else + UNI_ERROR_LOG("This library not support ARM MALI/Qualcomm GPU, please rebuild library " + "with --gpu option.\n"); + exit(1); +#endif + } + algorithmMap = std::shared_ptr( + new AlgorithmMap(this->deviceInfo.schedule, name, deviceName, dt)); +} + +void Model::set_runtime_device(int cpuId, int threadId) +{ + this->set_runtime_device(cpuId, this->deviceInfo.archs[cpuId], threadId); +} + +void Model::set_runtime_device(int cpuId, Arch arch, int threadId) +{ + this->deviceInfo.schedule = arch; + UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) + if (cpuId >= 0 && cpuId < this->deviceInfo.cpuNum) { + set_thread_affinity(threadId, &cpuId, 1); + for (auto op : ops) { + op->set_schedule(this->deviceInfo.schedule); + } + } +} + +void Model::set_runtime_device_dynamic(int threadId) +{ + set_cpu_dynamic(&this->deviceInfo, threadId); +} + +Arch Model::get_runtime_device() +{ + return this->deviceInfo.schedule; +} + +void Model::ready(std::map inputDescMap) +{ + infer_output_tensors_size(inputDescMap); + assign_output_tensor(); + + infer_tmp_memory_size(); + assign_tmp_tensor(); +} + +#ifdef _USE_INT8 +U32 Model::find_next_dynamic_scale_op(std::vector calibratedOpIdx, U32 startIdx) +{ + CHECK_REQUIREMENT(startIdx < this->ops.size()) + for (U32 i = startIdx; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->is_dynamic_scale()) { + bool calibrated = false; + for (auto idx : calibratedOpIdx) { + if (i == 
idx) { + calibrated = true; + break; + } + } + if (!calibrated) { + return i; + } + } + + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + i = op->get_next_operator_index(); + } else { + i++; + } + } + + return 0; // The first layer should never be quantized +} + +std::shared_ptr Model::get_operator_by_index(U32 index) +{ + return this->ops[index]; +} + +void Model::run_till_breakpoint(U32 opIdx) +{ + CHECK_REQUIREMENT(IS_CPU(this->deviceInfo.schedule)); + for (U32 i = 0; i < this->ops.size();) { + auto op = this->ops[i]; + if (op->get_type() == OT_Repeat || op->get_type() == OT_Jump) { + if (opIdx == i) { + break; + } + i = op->get_next_operator_index(); + } else { + op->run(); + if (opIdx == i) { + break; + } + i++; + } + } +} +#endif + +std::string Model::get_name() +{ + return this->name; +} + +void Model::loadAlgorithmMap(CI8 *path, bool useFileStream) +{ + std::string algoName = this->algorithmMap->getAlgorithmFileName(); + CI8 *algoInfo = nullptr; + if (IS_GPU(this->deviceInfo.schedule)) { +#ifdef _USE_GPU + algoInfo = gcl_get_algorithm_info(OCLContext::getInstance().handle.get(), algoName); +#endif + } + if (!algoInfo && useFileStream) { + algoInfo = path; + } + if (algoInfo) { + this->algorithmMap->loadAlgorithmMapFromFileStream(algoInfo); + } else if (path) { + this->algorithmMap->loadAlgorithmMapFromFile(path); + } +} + +void Model::saveAlgorithmMapToFile(std::string algorithmMapPath) +{ + this->algorithmMap->saveAlgorithmMapToFile(algorithmMapPath); +} + +void Model::set_device_info(AffinityPolicy affinityPolicy) +{ +#ifndef _USE_IOS + this->deviceInfo = get_cpu_info(affinityPolicy); + this->set_runtime_device_dynamic(); +#else + this->deviceInfo.affinityPolicy = affinityPolicy; + this->deviceInfo.schedule = ARM_A76; +#endif + UNI_DEBUG_LOG("Inference use %s.\n", ArchName()[this->deviceInfo.schedule]) +} diff --git a/inference/engine/src/model_calibration.cpp b/inference/engine/src/model_calibration.cpp index bedc2da8..3acde351 100644 --- a/inference/engine/src/model_calibration.cpp +++ b/inference/engine/src/model_calibration.cpp @@ -287,7 +287,7 @@ void calibrate_model_with_dataset(std::string dataPath, tensorSize.push_back( tensorNumElements(resizedTensors[tensorPosition[i].second].get_desc())); dBytes = tensorSize.back() * elementBytes; - memcpy(d, + UNI_MEMCPY(d, ((CpuMemory *)(resizedTensors[tensorPosition[i].second].get_memory()))->get_ptr(), dBytes); @@ -347,13 +347,13 @@ void calibrate_model_with_dataset(std::string dataPath, resultMs->ops[opIdx].num_quant_feature = scales.size(); resultMs->ops[opIdx].feature_scale = - (QuantSpec *)mt_new_storage(scales.size() * sizeof(QuantSpec)); + (QuantSpec *)mt_malloc(scales.size() * sizeof(QuantSpec)); for (U32 i = 0; i < scales.size(); i++) { resultMs->ops[opIdx].feature_scale[i].num_scale = scales[i].size(); U32 scaleBytes = scales[i].size() * sizeof(F32); - resultMs->ops[opIdx].feature_scale[i].scale = (F32 *)mt_new_storage(scaleBytes); - memcpy(resultMs->ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes); + resultMs->ops[opIdx].feature_scale[i].scale = (F32 *)mt_malloc(scaleBytes); + UNI_MEMCPY(resultMs->ops[opIdx].feature_scale[i].scale, scales[i].data(), scaleBytes); } calibratedOpIdx.push_back(opIdx); diff --git a/inference/engine/src/result_format.cpp b/inference/engine/src/result_format.cpp index bd7abb67..4eb96155 100644 --- a/inference/engine/src/result_format.cpp +++ b/inference/engine/src/result_format.cpp @@ -23,7 +23,7 @@ std::vector topK_index(U8 *res, TensorDesc desc, U32 topK) } 
switch (desc.dt) { -#ifdef __aarch64__ +#ifdef _USE_FP16 case DT_F16: { F16 *dataPtr = (F16 *)res; sort(index.begin(), index.end(), diff --git a/inference/engine/tools/common_algo_search/common_algo_search.cpp b/inference/engine/tools/common_algo_search/common_algo_search.cpp index b30ddf20..03f4dbb4 100644 --- a/inference/engine/tools/common_algo_search/common_algo_search.cpp +++ b/inference/engine/tools/common_algo_search/common_algo_search.cpp @@ -55,10 +55,10 @@ int convolutionCPUFloatAlgorithmSearch(Arch arch, DataType dt, std::string path) } convParamSpec.stride_h = sv; convParamSpec.stride_w = sv; - convParamSpec.padding_left = pl; - convParamSpec.padding_right = pr; - convParamSpec.padding_top = pt; - convParamSpec.padding_bottom = pb; + convParamSpec.pad_left = pl; + convParamSpec.pad_right = pr; + convParamSpec.pad_top = pt; + convParamSpec.pad_bottom = pb; filterDesc = tensor4df(dt, DF_NCHW, fn, ic, fv, fv); Tensor inputTensor; Tensor outputTensor; diff --git a/inference/engine/tools/model_finetuner/model_finetuner.cpp b/inference/engine/tools/model_finetuner/model_finetuner.cpp index 3cb0510b..492383e2 100644 --- a/inference/engine/tools/model_finetuner/model_finetuner.cpp +++ b/inference/engine/tools/model_finetuner/model_finetuner.cpp @@ -58,7 +58,7 @@ void load_cifar10(U8 *dataset, U32 batchIdx, TensorDesc inDesc, float *pixels, T archInfo.arch = CPU_GENERAL; if (training) { - memset(labels, 0, BATCH_SIZE * 10 * sizeof(float)); + UNI_MEMSET(labels, 0, BATCH_SIZE * 10 * sizeof(float)); } Tensor tmp, output; output.resize(inDesc); diff --git a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp index d79d29b0..4e33d216 100644 --- a/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp +++ b/inference/engine/tools/preprocess_ocl/preprocess_ocl.cpp @@ -17,7 +17,6 @@ #include "result_format.hpp" #include #include -#include #include #include #include @@ -70,52 +69,50 @@ inline void write_to_file(std::string str, std::string path, std::string name) inline void runBoltModel( CI8 *modelPath, CI8 *algoPath, std::map> *kernelInfos) { + UNI_INFO_LOG("Build gpu kernels and algorithm map file for bolt model(%s)...\n", modelPath); if (!strstr(modelPath, "f16.bolt")) { - UNI_ERROR_LOG("Bolt gpu only support F16(_f16.bolt) now\n"); - UNI_ERROR_LOG("Ensure your model is xxxx_f16.bolt\n"); + UNI_ERROR_LOG("Bolt gpu only support float16 inference, and model file is end with " + "_f16.bolt suffix.\n"); exit(1); } - UNI_INFO_LOG("Building algofile and used kernelInfos for %s\n", modelPath); - - ModelHandle model_address = model_address = CreateModel(modelPath, GPU, algoPath); - int num_input = GetNumInputsFromModel(model_address); - int *n = (int *)malloc(sizeof(int) * num_input); - int *c = (int *)malloc(sizeof(int) * num_input); - int *h = (int *)malloc(sizeof(int) * num_input); - int *w = (int *)malloc(sizeof(int) * num_input); - char **name = (char **)malloc(sizeof(char *) * num_input); - for (int i = 0; i < num_input; i++) { - name[i] = (char *)malloc(sizeof(char) * 1024); + ModelHandle model = CreateModel(modelPath, GPU, algoPath); + int input_num = GetNumInputsFromModel(model); + int *input_n = (int *)malloc(sizeof(int) * input_num); + int *input_c = (int *)malloc(sizeof(int) * input_num); + int *input_h = (int *)malloc(sizeof(int) * input_num); + int *input_w = (int *)malloc(sizeof(int) * input_num); + DATA_TYPE *input_dt = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * input_num); + DATA_FORMAT *input_df = (DATA_FORMAT 
*)malloc(sizeof(DATA_FORMAT) * input_num); + char **input_name = (char **)malloc(sizeof(char *) * input_num); + for (int i = 0; i < input_num; i++) { + input_name[i] = (char *)malloc(sizeof(char) * 1024); } - DATA_TYPE *dt_input = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * num_input); - DATA_FORMAT *df_input = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * num_input); - GetInputDataInfoFromModel(model_address, num_input, name, n, c, h, w, dt_input, df_input); - unsigned char **input_ptr = (unsigned char **)malloc(sizeof(unsigned char *) * num_input); - for (int i = 0; i < num_input; i++) { - int length = n[i] * c[i] * h[i] * w[i]; - F16 *ptr = (F16 *)malloc(sizeof(F16) * length); - for (int i = 0; i < length; i++) { - ptr[i] = 1; - } - input_ptr[i] = (unsigned char *)ptr; + GetInputDataInfoFromModel( + model, input_num, input_name, input_n, input_c, input_h, input_w, input_dt, input_df); + unsigned char **input_ptr = (unsigned char **)malloc(sizeof(unsigned char *) * input_num); + for (int i = 0; i < input_num; i++) { + int length = input_n[i] * input_c[i] * input_h[i] * input_w[i]; + input_ptr[i] = (unsigned char *)malloc(sizeof(F16) * length); + UNI_INIT(length, DT_F16, 1, input_ptr[i]); } - PrepareModel(model_address, num_input, (const char **)name, n, c, h, w, dt_input, df_input); - ResultHandle model_result = AllocAllResultHandle(model_address); - int model_result_num = GetNumOutputsFromResultHandle(model_result); - int *output_n = (int *)malloc(sizeof(int) * model_result_num); - int *output_c = (int *)malloc(sizeof(int) * model_result_num); - int *output_h = (int *)malloc(sizeof(int) * model_result_num); - int *output_w = (int *)malloc(sizeof(int) * model_result_num); - char **outputNames = (char **)malloc(sizeof(char *) * model_result_num); - for (int i = 0; i < model_result_num; i++) { - outputNames[i] = (char *)malloc(sizeof(char) * 1024); + PrepareModel(model, input_num, (const char **)input_name, input_n, input_c, input_h, input_w, + input_dt, input_df); + ResultHandle result = AllocAllResultHandle(model); + int output_num = GetNumOutputsFromResultHandle(result); + int *output_n = (int *)malloc(sizeof(int) * output_num); + int *output_c = (int *)malloc(sizeof(int) * output_num); + int *output_h = (int *)malloc(sizeof(int) * output_num); + int *output_w = (int *)malloc(sizeof(int) * output_num); + DATA_TYPE *output_dt = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * output_num); + DATA_FORMAT *output_df = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * output_num); + char **output_name = (char **)malloc(sizeof(char *) * output_num); + for (int i = 0; i < output_num; i++) { + output_name[i] = (char *)malloc(sizeof(char) * 1024); } - DATA_TYPE *dt_output = (DATA_TYPE *)malloc(sizeof(DATA_TYPE) * model_result_num); - DATA_FORMAT *df_output = (DATA_FORMAT *)malloc(sizeof(DATA_FORMAT) * model_result_num); - GetOutputDataInfoFromResultHandle(model_result, model_result_num, outputNames, output_n, - output_c, output_h, output_w, dt_output, df_output); - RunModel(model_address, model_result, num_input, (const char **)name, (void **)input_ptr); + GetOutputDataInfoFromResultHandle(result, output_num, output_name, output_n, output_c, output_h, + output_w, output_dt, output_df); + RunModel(model, result, input_num, (const char **)input_name, (void **)input_ptr); GCLHandle_t handle = OCLContext::getInstance().handle.get(); for (auto p : handle->kernelMap) { @@ -153,31 +150,31 @@ inline void runBoltModel( } } CHECK_STATUS(gcl_finish(handle)); - FreeResultHandle(model_result); - DestroyModel(model_address); + 
FreeResultHandle(result); + DestroyModel(model); - free(n); - free(c); - free(h); - free(w); - free(dt_input); - free(df_input); - for (int i = 0; i < num_input; i++) { - free(name[i]); + free(input_n); + free(input_c); + free(input_h); + free(input_w); + free(input_dt); + free(input_df); + for (int i = 0; i < input_num; i++) { + free(input_name[i]); free(input_ptr[i]); } - free(name); + free(input_name); free(input_ptr); free(output_n); free(output_c); free(output_h); free(output_w); - free(dt_output); - free(df_output); - for (int i = 0; i < model_result_num; i++) { - free(outputNames[i]); + free(output_dt); + free(output_df); + for (int i = 0; i < output_num; i++) { + free(output_name[i]); } - free(outputNames); + free(output_name); } inline void buildFileStream(CI8 *fileName, U8 **bytesPtr, U32 *len) @@ -238,6 +235,7 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "#include \"gcl_kernel_binmap.h\"\n"; device_map += "#include \"" + device_map_head_name + "\"\n"; + I8 buffer[16]; for (auto p : kernelInfos) { std::string kernelName = p.first; std::vector binaryInfo = p.second; @@ -247,12 +245,11 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "const unsigned int " + func + "_len = " + std::to_string(len) + ";\n"; device_map += "const unsigned char " + func + "[] = " + "{"; for (U32 i = 0; i < len; i++) { - I8 tempstr[4]; if (i % 20 == 0) { device_map += "\n"; } - sprintf(tempstr, "0x%02x", binaryInfo[i]); - device_map += std::string(tempstr); + sprintf(buffer, "0x%02x", binaryInfo[i]); + device_map += std::string(buffer); if (i != len - 1) { device_map += ", "; } else { @@ -269,12 +266,11 @@ inline void buildKernelBinFiles(std::map> kernelInf device_map += "const unsigned int " + algoName + "_len = " + std::to_string(len) + ";\n"; device_map += "const unsigned char " + algoName + "[] = " + "{"; for (U32 i = 0; i < len; i++) { - I8 tempstr[4]; if (i % 20 == 0) { device_map += "\n"; } - sprintf(tempstr, "0x%02x", bytes[i]); - device_map += std::string(tempstr); + sprintf(buffer, "0x%02x", bytes[i]); + device_map += std::string(buffer); if (i != len - 1) { device_map += ", "; } else { @@ -333,29 +329,10 @@ int main(int argc, char *argv[]) exit(1); } I8 lastFlag; - std::string modelsPath = (CI8 *)argv[1]; - lastFlag = modelsPath[modelsPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - modelsPath += "/"; - } - - std::string algoPath = (CI8 *)argv[2]; - lastFlag = algoPath[algoPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - algoPath += "/"; - } - - std::string includePath = (CI8 *)argv[3]; - lastFlag = includePath[includePath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - includePath += "/"; - } - - std::string cppPath = (CI8 *)argv[4]; - lastFlag = cppPath[cppPath.length() - 1]; - if (strcmp(&lastFlag, "/") != 0) { - cppPath += "/"; - } + std::string modelsPath = (CI8 *)argv[1] + std::string("/"); + std::string algoPath = (CI8 *)argv[2] + std::string("/"); + std::string includePath = (CI8 *)argv[3] + std::string("/"); + std::string cppPath = (CI8 *)argv[4] + std::string("/"); std::vector modelNamesArray; modelNamesArray = buildFileNamesArray(modelsPath, ".bolt"); diff --git a/inference/examples/CMakeLists.txt b/inference/examples/CMakeLists.txt index 17356469..e494d978 100644 --- a/inference/examples/CMakeLists.txt +++ b/inference/examples/CMakeLists.txt @@ -21,13 +21,14 @@ engine_test(benchmark benchmark/benchmark.cpp) install(TARGETS benchmark RUNTIME DESTINATION examples) if (BUILD_TEST) - 
engine_test(c_image_classifification "c_api/c_image_classifification.c;c_api/c_common.c;c_api/c_test.c") + engine_test(c_image_classification "c_api/c_image_classification.c;c_api/c_common.c;c_api/c_test.c") engine_test(c_input_method "c_api/c_input_method.c;c_api/c_common.c;c_api/c_test.c") engine_test(bert bert/bert.cpp) engine_test(tinybert bert/tinybert.cpp) engine_test(classification image_classification/classification.cpp) engine_test(nmt machine_translation/nmt.cpp) engine_test(nmt_tsc machine_translation/nmt_tsc.cpp) + engine_test(tsc_ssru machine_translation/tsc_ssru.cpp) engine_test(asr_rnnt automatic_speech_recognition/asr_rnnt.cpp) engine_test(asr_convolution_transformer automatic_speech_recognition/asr_convolution_transformer.cpp) engine_test(tts text_to_speech/tts.cpp) @@ -47,11 +48,12 @@ if (BUILD_TEST) tinybert tinybert_onnx nmt + tsc_ssru asr_rnnt asr_convolution_transformer tts vad - c_image_classifification + c_image_classification c_input_method RUNTIME DESTINATION examples) diff --git a/inference/examples/automatic_speech_recognition/flow_asr.h b/inference/examples/automatic_speech_recognition/flow_asr.h index a7ef6401..0fd6a345 100644 --- a/inference/examples/automatic_speech_recognition/flow_asr.h +++ b/inference/examples/automatic_speech_recognition/flow_asr.h @@ -267,7 +267,7 @@ void loadBinary(const std::string fileName, char *data, size_t size) ifs.seekg(0, std::ifstream::beg); ifs.read(data, UNI_MIN(length, size)); if (length < size) { - memset(data + length, 0, size - length); + UNI_MEMSET(data + length, 0, size - length); } ifs.close(); } @@ -352,7 +352,7 @@ std::map> getEncoderInputOutput( case DT_F32: { F32 *ptr = (F32 *)((CpuMemory *)(tensors["sounds"]->get_memory()))->get_ptr(); for (int i = 0; i < frameLength; i++) { - memcpy(ptr + i * featureLength, feature[0][i + frameOffset].data(), + UNI_MEMCPY(ptr + i * featureLength, feature[0][i + frameOffset].data(), featureLength * sizeof(float)); } break; @@ -497,7 +497,7 @@ std::map> getEncoderInputOutput( if (iter.first != std::string("sounds")) { TensorDesc desc = iter.second->get_desc(); U8 *ptr = (U8 *)((CpuMemory *)(iter.second->get_memory()))->get_ptr(); - memset(ptr, 0, tensorNumBytes(desc)); + UNI_MEMSET(ptr, 0, tensorNumBytes(desc)); } } } @@ -606,7 +606,7 @@ std::map> getPinYin2HanZiInputOutput(int fr tensors["pinyin"]->resize(tensor2df(DT_U32, DF_NORMAL, 1, bufferLength)); tensors["pinyin"]->alloc(); if (frameId == 0) { - memset(buffer, 0, sizeof(unsigned int) * bufferLength); + UNI_MEMSET(buffer, 0, sizeof(unsigned int) * bufferLength); } int pinyin = *((unsigned int *)((CpuMemory *)(joint["output_argmax"]->get_memory()))->get_ptr()) - PINYIN_FEATURE_GAP; @@ -620,8 +620,8 @@ std::map> getPinYin2HanZiInputOutput(int fr buffer[bufferValidSize - 1] = pinyin; } unsigned int *ptr = (unsigned int *)((CpuMemory *)(tensors["pinyin"]->get_memory()))->get_ptr(); - memcpy(ptr, buffer, sizeof(unsigned int) * bufferValidSize); - memset(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize)); + UNI_MEMCPY(ptr, buffer, sizeof(unsigned int) * bufferValidSize); + UNI_MEMSET(ptr + bufferValidSize, 0, sizeof(unsigned int) * (bufferLength - bufferValidSize)); tensors["hanzi_squeeze/Squeeze"] = std::shared_ptr(new Tensor()); std::shared_ptr tmp; diff --git a/inference/examples/automatic_speech_recognition/vad.cpp b/inference/examples/automatic_speech_recognition/vad.cpp index 2a327129..05ba0ab4 100644 --- a/inference/examples/automatic_speech_recognition/vad.cpp +++ 
b/inference/examples/automatic_speech_recognition/vad.cpp @@ -92,7 +92,7 @@ int main(int argc, char *argv[]) std::cout << "output_eoq: " << eoq.element(0) << " " << eoq.element(1) << std::endl; falseResult += verify(vad, eoq); Tensor outCache = pipeline->get_tensor_by_name("output_cache"); - memcpy(cache.data(), (U8 *)((CpuMemory *)(outCache.get_memory()))->get_ptr(), + UNI_MEMCPY(cache.data(), (U8 *)((CpuMemory *)(outCache.get_memory()))->get_ptr(), tensorNumBytes(cacheDesc)); } UNI_TIME_STATISTICS diff --git a/inference/examples/benchmark/benchmark.cpp b/inference/examples/benchmark/benchmark.cpp index 84a179d8..7a1171dd 100644 --- a/inference/examples/benchmark/benchmark.cpp +++ b/inference/examples/benchmark/benchmark.cpp @@ -43,11 +43,14 @@ void print_benchmark_usage() "5. -l [loopTime]: The running loopTimes. The default value is %d.\n" "6. -w [warmUp]: WarmUp times. The default value is %d.\n" "7. -t [threadsNum]: Parallel threads num. The default value is %d.\n" - "Example: ./benchmark -m /local/models/resnet50_f16.bolt\n", + "Example:\n" + " ./benchmark -m /local/models/resnet50_f16.bolt\n" + " ./benchmark -m /local/models/resnet50_f16.bolt -i ./input.txt\n" + " ./benchmark -m /local/models/resnet50_f16.bolt -i ./data/\n", loopTime, warmUp, threadsNum); } -void parse_options(int argc, char *argv[]) +int parse_options(int argc, char *argv[]) { std::cout << "\nPlease enter this command './benchmark --help' to get more usage " "information.\n"; @@ -55,7 +58,7 @@ void parse_options(int argc, char *argv[]) for (std::string arg : lineArgs) { if (arg == "--help" || arg == "-help" || arg == "--h" || arg == "-h") { print_benchmark_usage(); - exit(-1); + return 0; } } @@ -94,27 +97,35 @@ void parse_options(int argc, char *argv[]) default: std::cout << "Input option gets error, please check the params meticulously.\n"; print_benchmark_usage(); - exit(-1); + return 0; } } + return 1; } std::map> create_tensors_from_path( - std::string dataPath, std::shared_ptr pipeline) + std::string inputData, std::shared_ptr pipeline) { std::map inputDescMap = pipeline->get_input_desc(); - std::vector sourceDataTypes; - std::vector inputDescs; - for (auto iter : inputDescMap) { - TensorDesc curDesc = iter.second; - sourceDataTypes.push_back(curDesc.dt); - inputDescs.push_back(curDesc); - } std::vector input; - if (string_end_with(inputData, ".txt")) { - input = load_txt(inputData, inputDescs); + if (inputData != "" && is_directory(inputData)) { + for (auto iter : inputDescMap) { + std::string path = inputData + "/" + iter.first + ".txt"; + input.push_back(load_txt(path, {iter.second})[0]); + } } else { - input = load_bin(inputData, sourceDataTypes, inputDescs); + std::vector sourceDataTypes; + std::vector inputDescs; + for (auto iter : inputDescMap) { + TensorDesc curDesc = iter.second; + sourceDataTypes.push_back(curDesc.dt); + inputDescs.push_back(curDesc); + } + if (string_end_with(inputData, ".txt")) { + input = load_txt(inputData, inputDescs); + } else { + input = load_bin(inputData, sourceDataTypes, inputDescs); + } } std::map> model_tensors_input; int index = 0; @@ -145,7 +156,7 @@ std::map> get_output( for (auto iter : outMap) { Tensor result = *(iter.second); auto mem = (OclMemory *)result.get_memory(); - mem->get_mapped_ptr(); + UNI_PROFILE(mem->get_mapped_ptr(), "copy " + iter.first, std::string("output::copy")); } #else UNI_WARNING_LOG("this binary not support GPU, please recompile project with GPU " @@ -155,15 +166,26 @@ std::map> get_output( return outMap; } -int main(int argc, char *argv[]) 
+int benchmark(int argc, char *argv[]) { UNI_TIME_INIT - parse_options(argc, argv); + int ret = parse_options(argc, argv); + if (!ret) { + return 0; + } set_cpu_num_threads(threadsNum); // 1: set up the pipeline + double timeBegin = ut_time_ms(); auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); +#ifdef _USE_GPU + if (std::string(affinityPolicyName) == std::string("GPU")) { + gcl_finish(OCLContext::getInstance().handle.get()); + } +#endif + double timeEnd = ut_time_ms(); + double prepareTime = timeEnd - timeBegin; // 2: create input data and feed the pipeline with it auto model_tensors_input = create_tensors_from_path(inputData, pipeline); @@ -171,16 +193,21 @@ int main(int argc, char *argv[]) std::map> outMap; // 3: warm up and run + UNI_TIME_STOP + timeBegin = ut_time_ms(); for (int i = 0; i < warmUp; i++) { pipeline->set_input_by_assign(model_tensors_input); pipeline->run(); outMap = get_output(pipeline, affinityPolicyName); } #ifdef _USE_GPU - if (strcmp(affinityPolicyName, "GPU") == 0) { + if (std::string(affinityPolicyName) == std::string("GPU")) { gcl_finish(OCLContext::getInstance().handle.get()); } #endif + timeEnd = ut_time_ms(); + double warmUpTime = timeEnd - timeBegin; + UNI_TIME_START double minTime = DBL_MAX; double maxTime = 0; @@ -201,10 +228,19 @@ int main(int argc, char *argv[]) print_result(outMap); UNI_TIME_STATISTICS - UNI_CI_LOG("total_time:%fms(loops=%d)\n", 1.0 * totalTime, loopTime); - UNI_CI_LOG("avg_time:%fms/data\n", 1.0 * totalTime / UNI_MAX(1, loopTime)); - UNI_CI_LOG("min_time:%fms/data\n", 1.0 * minTime); - UNI_CI_LOG("max_time:%fms/data\n", 1.0 * maxTime); + UNI_CI_LOG("model prepare_time:%fms\n", 1.0 * prepareTime); + UNI_CI_LOG("model warm_up_time:%fms\n", 1.0 * warmUpTime); + UNI_CI_LOG("run total_time:%fms(loops=%d)\n", 1.0 * totalTime, loopTime); + UNI_CI_LOG("run avg_time:%fms/data\n", 1.0 * totalTime / UNI_MAX(1, loopTime)); + UNI_CI_LOG("run min_time:%fms/data\n", 1.0 * minTime); + UNI_CI_LOG("run max_time:%fms/data\n", 1.0 * maxTime); pipeline->saveAlgorithmMapToFile(algorithmMapPath); return 0; } + +int main(int argc, char *argv[]) +{ + int ret = benchmark(argc, argv); + UNI_MEM_STATISTICS(); + return ret; +} diff --git a/inference/examples/bert/flow_tinybert.cpp b/inference/examples/bert/flow_tinybert.cpp index 859afc94..6268d168 100644 --- a/inference/examples/bert/flow_tinybert.cpp +++ b/inference/examples/bert/flow_tinybert.cpp @@ -36,7 +36,7 @@ std::map> inputOutput() tensors["tinybert_words"] = std::shared_ptr(new Tensor()); tensors["tinybert_words"]->resize(inputDesc); tensors["tinybert_words"]->alloc(); - memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + UNI_MEMCPY(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, tensorNumBytes(inputDesc)); tensors["tinybert_positions"] = std::shared_ptr(new Tensor()); diff --git a/inference/examples/bert/graph_tinybert.cpp b/inference/examples/bert/graph_tinybert.cpp index 5eb77d5b..70bb9ea1 100644 --- a/inference/examples/bert/graph_tinybert.cpp +++ b/inference/examples/bert/graph_tinybert.cpp @@ -39,7 +39,7 @@ std::map> inputOutput() tensors["tinybert_words"] = std::shared_ptr(new Tensor()); tensors["tinybert_words"]->resize(inputDesc); tensors["tinybert_words"]->alloc(); - memcpy(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, + UNI_MEMCPY(((CpuMemory *)tensors["tinybert_words"]->get_memory())->get_ptr(), words, tensorNumBytes(inputDesc)); tensors["tinybert_positions"] = 
std::shared_ptr(new Tensor()); diff --git a/inference/examples/bert/tinybert_test.h b/inference/examples/bert/tinybert_test.h index e3409cb2..7aba1717 100644 --- a/inference/examples/bert/tinybert_test.h +++ b/inference/examples/bert/tinybert_test.h @@ -146,7 +146,7 @@ inline void tinybertTest(int argc, loopTime = parse_res.loopTime.first; } - bool useGPU = (strcmp(affinityPolicyName, "GPU") == 0) ? true : false; + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); std::shared_ptr pipelineBase; UNI_PROFILE(pipelineBase = createPipeline(affinityPolicyName, modelPath, algorithmMapPath), std::string("bolt::prepare"), std::string("prepare")); diff --git a/inference/examples/c_api/Makefile b/inference/examples/c_api/Makefile new file mode 100644 index 00000000..d1e67afb --- /dev/null +++ b/inference/examples/c_api/Makefile @@ -0,0 +1,16 @@ +CC=aarch64-linux-android21-clang +CFLAGS=-O3 -I../../../inference/engine/include +LDFLAGS=../../../install_android-aarch64/lib/libbolt.a -llog \ + ../../../third_party/android-aarch64/opencl/lib/libOpenCL.so \ + -lm -lstdc++ + +targets: c_image_classification c_input_method + +c_image_classification: c_image_classification.o c_common.o c_test.o + ${CC} -o $@ $^ ${LDFLAGS} +c_input_method: c_input_method.o c_common.o c_test.o + ${CC} -o $@ $^ ${LDFLAGS} +%.o: %.c + $(CC) -c $(CFLAGS) $< -o $@ +clean: + rm -rf *.o c_image_classification c_input_method diff --git a/inference/examples/c_api/c_common.c b/inference/examples/c_api/c_common.c index 97e4d7ba..15d83476 100644 --- a/inference/examples/c_api/c_common.c +++ b/inference/examples/c_api/c_common.c @@ -88,7 +88,7 @@ void MallocTensor(int num, const DATA_FORMAT *df, void ***data) { - *data = malloc(sizeof(void *) * num); + *data = (void **)malloc(sizeof(void *) * num); for (int i = 0; i < num; i++) { int length = n[i] * c[i] * h[i] * w[i]; switch (dt[i]) { @@ -128,9 +128,7 @@ void CreateInference(int useModelFileStream, const char *algorithmMapPath, AFFINITY_TYPE affinity, ModelHandle *inferenceHandle, - ResultHandle *resultHandle, - int *inputNum, - char ***inputName) + ResultHandle *resultHandle) { if (useModelFileStream) { *inferenceHandle = CreateModelWithFileStream(modelPath, affinity, algorithmMapPath); @@ -153,12 +151,10 @@ void CreateInference(int useModelFileStream, *resultHandle = AllocAllResultHandle(*inferenceHandle); - *inputNum = in_num; - *inputName = in_name; - //for (int i = 0; i < in_num; i++) { - // free(in_name[i]); - //} - //free(in_name); + for (int i = 0; i < in_num; i++) { + free(in_name[i]); + } + free(in_name); free(in_n); free(in_c); free(in_h); diff --git a/inference/examples/c_api/c_common.h b/inference/examples/c_api/c_common.h index 49d8241c..10c3afe5 100644 --- a/inference/examples/c_api/c_common.h +++ b/inference/examples/c_api/c_common.h @@ -61,7 +61,5 @@ void CreateInference(int useModelFileStream, const char *algorithmMapPath, AFFINITY_TYPE affinity, ModelHandle *inferenceHandle, - ResultHandle *resultHandle, - int *inputNum, - char ***inputName); + ResultHandle *resultHandle); #endif diff --git a/inference/examples/c_api/c_image_classifification.c b/inference/examples/c_api/c_image_classification.c similarity index 86% rename from inference/examples/c_api/c_image_classifification.c rename to inference/examples/c_api/c_image_classification.c index 87b583da..e7414293 100644 --- a/inference/examples/c_api/c_image_classifification.c +++ b/inference/examples/c_api/c_image_classification.c @@ -18,15 +18,13 @@ int main(int argc, char *argv[]) ParseOptions(argc, 
argv); ModelHandle inferenceHandle; ResultHandle resultHandle; - int inNum; - char **inName; if (useFileStream) { char *modelFileStream = BuildFileStream(modelPath); CreateInference(useFileStream, modelFileStream, algorithmMapPath, affinity, - &inferenceHandle, &resultHandle, &inNum, &inName); + &inferenceHandle, &resultHandle); } else { - CreateInference(useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, - &resultHandle, &inNum, &inName); + CreateInference( + useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, &resultHandle); } int inputNum, *inputN, *inputC, *inputH, *inputW; @@ -39,6 +37,10 @@ int main(int argc, char *argv[]) MallocTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, &inputData); InitTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, 1); + PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, + "input ", 8); + RunModel(inferenceHandle, resultHandle, inputNum, (const char **)inputName, inputData); + int outputNum, *outputN, *outputC, *outputH, *outputW; DATA_TYPE *outputDT; DATA_FORMAT *outputDF; @@ -46,12 +48,7 @@ int main(int argc, char *argv[]) void **outputData; CreateOutputTensorDesc(resultHandle, &outputNum, &outputName, &outputN, &outputC, &outputH, &outputW, &outputDT, &outputDF); - outputData = malloc(sizeof(void *) * outputNum); - - PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, - "input ", 8); - RunModel(inferenceHandle, resultHandle, inNum, (const char **)inName, inputData); - + outputData = (void **)malloc(sizeof(void *) * outputNum); GetOutputDataFromResultHandle(resultHandle, outputNum, outputData); PrintTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, outputData, "output ", 8); @@ -59,10 +56,6 @@ int main(int argc, char *argv[]) FreeTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData); FreeTensorDesc(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF); free(outputData); - for (int i = 0; i < inNum; i++) { - free(inName[i]); - } - free(inName); FreeResultHandle(resultHandle); DestroyModel(inferenceHandle); return 0; diff --git a/inference/examples/c_api/c_input_method.c b/inference/examples/c_api/c_input_method.c index 9baa2390..1172f23f 100644 --- a/inference/examples/c_api/c_input_method.c +++ b/inference/examples/c_api/c_input_method.c @@ -13,6 +13,7 @@ #include #include +#include "secure_c_wrapper.h" #include "../../examples/c_api/c_test.h" int main(int argc, char *argv[]) @@ -20,15 +21,13 @@ int main(int argc, char *argv[]) ParseOptions(argc, argv); ModelHandle inferenceHandle; ResultHandle resultHandle; - int inNum; - char **inName; if (useFileStream) { char *modelFileStream = BuildFileStream(modelPath); CreateInference(useFileStream, modelFileStream, algorithmMapPath, affinity, - &inferenceHandle, &resultHandle, &inNum, &inName); + &inferenceHandle, &resultHandle); } else { - CreateInference(useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, - &resultHandle, &inNum, &inName); + CreateInference( + useFileStream, modelPath, algorithmMapPath, affinity, &inferenceHandle, &resultHandle); } int inputNum, *inputN, *inputC, *inputH, *inputW; @@ -38,7 +37,7 @@ int main(int argc, char *argv[]) void **inputData; CreateInputTensorDesc(inferenceHandle, &inputNum, &inputName, &inputN, &inputC, &inputH, &inputW, &inputDT, &inputDF); - inputData = 
malloc(sizeof(void *) * inputNum); + inputData = (void **)malloc(sizeof(void *) * inputNum); int outputNum, *outputN, *outputC, *outputH, *outputW; DATA_TYPE *outputDT; @@ -47,7 +46,7 @@ int main(int argc, char *argv[]) void **outputData, **lastOutputData; CreateOutputTensorDesc(resultHandle, &outputNum, &outputName, &outputN, &outputC, &outputH, &outputW, &outputDT, &outputDF); - outputData = malloc(sizeof(void *) * outputNum); + outputData = (void **)malloc(sizeof(void *) * outputNum); MallocTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, &lastOutputData); InitTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, @@ -94,9 +93,9 @@ int main(int argc, char *argv[]) } PrintTensor(inputNum, inputName, inputN, inputC, inputH, inputW, inputDT, inputDF, inputData, " input ", 8); - ResizeModelInput(inferenceHandle, inNum, (const char **)inName, inputN, inputC, inputH, - inputW, inputDT, inputDF); - RunModel(inferenceHandle, resultHandle, inNum, (const char **)inName, inputData); + ResizeModelInput(inferenceHandle, inputNum, (const char **)inputName, inputN, inputC, + inputH, inputW, inputDT, inputDF); + RunModel(inferenceHandle, resultHandle, inputNum, (const char **)inputName, inputData); GetOutputDataFromResultHandle(resultHandle, outputNum, outputData); PrintTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, outputData, " output ", 8); @@ -107,10 +106,6 @@ int main(int argc, char *argv[]) FreeTensor(outputNum, outputName, outputN, outputC, outputH, outputW, outputDT, outputDF, lastOutputData); free(outputData); - for (int i = 0; i < inNum; i++) { - free(inName[i]); - } - free(inName); FreeResultHandle(resultHandle); DestroyModel(inferenceHandle); return 0; diff --git a/inference/examples/c_api/c_test.c b/inference/examples/c_api/c_test.c index 4bea1853..dd09de4d 100644 --- a/inference/examples/c_api/c_test.c +++ b/inference/examples/c_api/c_test.c @@ -13,13 +13,13 @@ #include #include -#include #include +#include #include "../../examples/c_api/c_test.h" char *modelPath = (char *)""; AFFINITY_TYPE affinity = CPU_HIGH_PERFORMANCE; -char *algorithmMapPath = (char *)"./"; +char *algorithmMapPath = NULL; int loopTime = 1; int useFileStream = 0; char *algorithmMapName = (char *)""; diff --git a/inference/examples/image_classification/classification.cpp b/inference/examples/image_classification/classification.cpp index be3746c1..3e0243fa 100644 --- a/inference/examples/image_classification/classification.cpp +++ b/inference/examples/image_classification/classification.cpp @@ -55,7 +55,7 @@ int main(int argc, char *argv[]) int category = -1; int loopTime = 1; if (!parse_res.model.second) { - exit(-1); + return 0; } if (parse_res.model.second) { modelPath = parse_res.model.first; @@ -137,7 +137,7 @@ int main(int argc, char *argv[]) cnn->run(); } #ifdef _USE_GPU - if (strcmp(affinityPolicyName, "GPU") == 0) { + if (std::string(affinityPolicyName) == std::string("GPU")) { gcl_finish(OCLContext::getInstance().handle.get()); } #endif diff --git a/inference/examples/image_matting/u2net.cpp b/inference/examples/image_matting/u2net.cpp index 26a99c90..1c5b21a8 100644 --- a/inference/examples/image_matting/u2net.cpp +++ b/inference/examples/image_matting/u2net.cpp @@ -121,7 +121,7 @@ std::shared_ptr preprocess(cv::Mat image, } } if (appending_channels != 0) { - memcpy(&(vec_transpose[iter_index]), &(vec_flow[0]), vec_flow.size() * sizeof(float)); + UNI_MEMCPY(&(vec_transpose[iter_index]), &(vec_flow[0]), 
vec_flow.size() * sizeof(float)); } return input_ptr; } diff --git a/inference/examples/machine_translation/nmt.cpp b/inference/examples/machine_translation/nmt.cpp index 61ab0872..f02b0b1c 100644 --- a/inference/examples/machine_translation/nmt.cpp +++ b/inference/examples/machine_translation/nmt.cpp @@ -45,7 +45,7 @@ int main(int argc, char *argv[]) if (parse_res.algoPath.second) { algorithmMapPath = parse_res.algoPath.first; } - bool useGPU = (strcmp(affinityPolicyName, "GPU") == 0) ? true : false; + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); auto pipeline = createPipeline(affinityPolicyName, modelPath, algorithmMapPath); diff --git a/inference/examples/machine_translation/tsc_ssru.cpp b/inference/examples/machine_translation/tsc_ssru.cpp new file mode 100644 index 00000000..17106255 --- /dev/null +++ b/inference/examples/machine_translation/tsc_ssru.cpp @@ -0,0 +1,364 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +#ifndef _H_TSC_SSRU_TEST +#define _H_TSC_SSRU_TEST + +#include "inference.hpp" +#include "data_loader.hpp" +#include "profiling.h" +#include "parse_command.h" + +int main(int argc, char *argv[]) +{ + UNI_TIME_INIT + ParseRes parse_res; + parseCommandLine(argc, argv, &parse_res, "examples"); + + char *modelPath = (char *)""; + char *sequenceDirectory = (char *)""; + char *affinityPolicyName = (char *)""; + char *algorithmMapPath = (char *)""; + int loopTime = 1; + + if (!parse_res.model.second) { + exit(-1); + } + if (parse_res.model.second) { + modelPath = parse_res.model.first; + } + if (parse_res.archInfo.second) { + affinityPolicyName = parse_res.archInfo.first; + } + if (parse_res.algoPath.second) { + algorithmMapPath = parse_res.algoPath.first; + } + if (parse_res.loopTime.second) { + loopTime = parse_res.loopTime.first; + } + bool useGPU = std::string(affinityPolicyName) == std::string("GPU"); + std::shared_ptr pipelineBase; + UNI_PROFILE(pipelineBase = createPipeline(affinityPolicyName, modelPath, algorithmMapPath), + std::string("bolt::prepare"), std::string("prepare")); + + U32 batch = 4; + U32 inputLen = 55; + U32 seqLen = batch * inputLen; + U32 shortlistLen = 25100; + U32 input_ids[] = { + 2583, + 16370, + 422, + 175, + 11445, + 38, + 156, + 16718, + 13, + 345, + 1485, + 3677, + 2, + 2905, + 845, + 17379, + 7408, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 2583, + 16370, + 43, + 5, + 2905, + 845, + 17379, + 109, + 16740, + 4, + 3339, + 12550, + 19144, + 55, + 257, + 7, + 156, + 18, + 1961, + 22348, + 1609, + 30, + 4, + 22068, + 12143, + 7, + 18, + 1394, + 609, + 172, + 4, + 1634, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 5999, + 1567, + 55, + 1588, + 2, + 331, + 15, + 1311, + 16969, + 8, + 6134, + 7, + 15, + 3770, + 7120, + 823, + 5, + 75, + 55, + 679, + 4508, + 2, + 5036, + 6753, + 47, + 16370, + 14288, + 4, + 3540, + 4862, + 6112, + 623, + 156, + 1124, + 82, + 278, + 1981, + 150, + 122, + 18183, + 55, + 13, + 42, + 15, + 33, + 4759, + 569, + 85, + 62, + 6, + 4, + 910, + 3873, + 3, + 0, + 1056, + 345, + 1485, + 3677, + 2, + 122, + 278, + 7088, + 107, + 1089, + 21486, + 9584, + 5, + 8, + 1329, + 11445, + 38, + 156, + 16718, + 13, + 15880, + 10997, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + }; + F32 *masks = (F32 *)malloc(seqLen * sizeof(F32)); + U32 h_sequence_length[] = {18, 34, 55, 24}; + for (U32 i = 0; i < batch; ++i) { + for (U32 j = 0; j < h_sequence_length[i]; ++j) { + masks[i * inputLen + j] = 1.0f; + } + for (U32 j = h_sequence_length[i]; j < inputLen; ++j) { + masks[i * inputLen + j] = 0.0f; + } + } + + U32 *positions = (U32 *)malloc(seqLen * sizeof(U32)); + for (U32 i = 0; i < seqLen; ++i) { + positions[i] = i % inputLen; + } + + U32 *shortlist = (U32 *)malloc(shortlistLen * sizeof(U32)); + for (U32 i = 0; i < shortlistLen; ++i) { + shortlist[i] = i; + } + + I32 trueRes[] = {2583, 16370, 14386, 14745, 2584, 37, 12, 14143, 2, 72, 3219, 2479, 19, 23, + 3268, 2, 13166, 12, 8506, 7585, 17379, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2583, 16370, 5, 8506, 61, 7585, 17379, 132, 13166, 2, 12, 24232, 17, 13955, + 813, 523, 468, 1406, 725, 2027, 725, 2027, 725, 27, 12, 6596, 14, 13039, 2, 10, 2471, 5104, + 61, 6584, 1048, 19, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5999, 1567, 55, 1588, 2, 615, 863, 6452, 9, + 9257, 93, 3821, 4579, 5369, 300, 2151, 8386, 2, 1195, 6753, 2, 86, 16370, 13071, 2, 86, 10, + 1075, 5, 153, 6112, 72, 272, 1232, 35, 9869, 1134, 2, 115, 97, 35, 9869, 1134, 2, 115, 97, + 35, 9869, 1134, 2, 44, 33, 12604, 569, 1080, 62, 6, 12, 7185, 23, 171, 3, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1760, 3219, 2479, 19, 23, 3268, 2, 1134, 97, 3771, 14, 23, 4980, 23969, + 15532, 14, 37, 5903, 9, 14745, 2584, 35, 10, 14143, 9300, 2, 72, 17832, 23, 24330, 3, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + // load sequences + const char *inputNames[4] = {"encoder_positions", "encoder_words", "nmt_mask", "shortlist"}; + const char *outputNames[1] = {"decoder_output"}; + + std::map inputDescMap; + inputDescMap[inputNames[0]] = tensor2d(DT_U32, batch, inputLen); + inputDescMap[inputNames[1]] = tensor2d(DT_U32, batch, inputLen); + inputDescMap[inputNames[2]] = tensor2d(DT_F32, batch, inputLen); + inputDescMap[inputNames[3]] = tensor2d(DT_U32, 1, shortlistLen); + pipelineBase->reready(inputDescMap); + + std::map> inputs; + inputs[inputNames[0]] = std::shared_ptr((U8 *)positions); + inputs[inputNames[1]] = std::shared_ptr((U8 *)input_ids, [](U8 *) {}); + inputs[inputNames[2]] = std::shared_ptr((U8 *)masks); + inputs[inputNames[3]] = std::shared_ptr((U8 *)shortlist); + + pipelineBase->set_input_by_assign(inputs); + double timeBegin = ut_time_ms(); + for (int i = 0; i < loopTime; ++i) { + pipelineBase->run(); + } + double timeEnd = ut_time_ms(); + double totalTime = (timeEnd - timeBegin); + + Tensor decoder_output = pipelineBase->get_tensor_by_name(outputNames[0]); + U32 outputNum = decoder_output.length(); + for (U32 i = 0; i < outputNum; ++i) { + if (decoder_output.element(i) != trueRes[i]) { + UNI_CI_LOG("ERROR: Get Wrong Result!\n"); + } + } + UNI_CI_LOG("avg_time: %fms/sequence\n", 1.0 * totalTime / loopTime); + return 0; +} + +#endif diff --git a/inference/examples/ultra_face/ultra_face.h b/inference/examples/ultra_face/ultra_face.h index e5111fb9..89a9608b 100644 --- 
a/inference/examples/ultra_face/ultra_face.h +++ b/inference/examples/ultra_face/ultra_face.h @@ -13,7 +13,6 @@ #ifndef _H_ULTRA_FACE #define _H_ULTRA_FACE -#define clip(x, y) (x < 0 ? 0 : (x > y ? y : x)) #include #include #include @@ -61,6 +60,13 @@ std::vector> featuremap_size; std::vector> shrinkage_size; std::vector w_h_list; std::vector> priors = {}; + +inline float clip(float x, float y) +{ + float ret = (x < 0 ? 0 : (x > y ? y : x)); + return ret; +} + inline void prior_boxes_generator( int input_width, int input_length, float score_threshold, float iou_threshold) { @@ -176,7 +182,7 @@ inline void nms(std::vector &input, std::vector &output, int total += exp(buf[i].score); } FaceInfo rects; - memset(&rects, 0, sizeof(rects)); + UNI_MEMSET(&rects, 0, sizeof(rects)); for (unsigned int i = 0; i < buf.size(); i++) { float rate = exp(buf[i].score) / total; rects.x1 += buf[i].x1 * rate; diff --git a/inference/examples/voice_wake_up/slide_tdnn.cpp b/inference/examples/voice_wake_up/slide_tdnn.cpp index db872ab0..cff3971d 100644 --- a/inference/examples/voice_wake_up/slide_tdnn.cpp +++ b/inference/examples/voice_wake_up/slide_tdnn.cpp @@ -159,13 +159,13 @@ int main(int argc, char *argv[]) Tensor buffer = Tensor::alloc_sized(inputDesc); std::shared_ptr dst = ((CpuMemory *)buffer.get_memory())->get_shared_ptr(); model_tensors_input[inputName] = dst; - memset(dst.get(), 0, frameNum * tileSize); + UNI_MEMSET(dst.get(), 0, frameNum * tileSize); // 3: run std::map> outMap; double timeBegin = ut_time_ms(); for (int i = 0; i < frameNum; i++) { - memcpy(dst.get() + (frameNum - i - 1) * tileSize, src.get(), (i + 1) * tileSize); + UNI_MEMCPY(dst.get() + (frameNum - i - 1) * tileSize, src.get(), (i + 1) * tileSize); pipeline->set_input_by_assign(model_tensors_input); pipeline->run(); outMap = get_output(pipeline, affinityPolicyName); diff --git a/inference/flow/src/node.cpp b/inference/flow/src/node.cpp index 7dd5fc20..6b2b64aa 100644 --- a/inference/flow/src/node.cpp +++ b/inference/flow/src/node.cpp @@ -221,7 +221,7 @@ EE Node::run() void *src = ((CpuMemory *)inferenceResult[name]->get_memory())->get_ptr(); void *dst = ((CpuMemory *)postprocessInputs[name]->get_memory())->get_ptr(); if (src != dst) { - memcpy(dst, src, tensorNumBytes(desc)); + UNI_MEMCPY(dst, src, tensorNumBytes(desc)); } } } diff --git a/install.sh b/install.sh index 983f95f8..a3d0fad5 100644 --- a/install.sh +++ b/install.sh @@ -8,6 +8,7 @@ target="" build_threads="8" converter="on" use_serial="on" +use_neon="on" use_fp32="on" use_fp16="on" use_int8="on" @@ -26,7 +27,7 @@ Build bolt library. Mandatory arguments to long options are mandatory for short options too. -h, --help display this help and exit. - --target= target device system and hardware setting, currently only support theses targets: + --target= target device system and hardware setting. xxx_blank will use shell environment variables CC, CXX, CFLAGS and CXXFLAGS, e.g. linux-aarch64_blank is for ARM64 server. currently only support theses targets: EOF print_targets cat < set to use serial calculation(default: ON). + --neon= set to use arm neon calculation(default: ON when using for arm platform). --fp32= set to use float32 calculation(default: ON). --fp16= set to use float16 calculation on arm aarch64(default: ON on aarch64, OFF on others). --int8= set to use int8 calculation on arm aarch64(default: ON on aarch64, OFF on others). 
@@ -49,7 +53,7 @@ EOF } cmake_options="" -TEMP=`getopt -o "ht:c:" -al target:,threads:,help,converter:,example,debug,profile,shared,gpu,openmp,flow,serial:,fp32:,fp16:,int8:,clean -- "$@"` +TEMP=`getopt -o "ht:c:" -al target:,threads:,help,converter:,example,debug,profile,shared,gpu,openmp,flow,serial:,neon:,fp32:,fp16:,int8:,train,clean,secure -- "$@"` if [[ $? != 0 ]]; then echo "[ERROR] ${script_name} terminating..." >&2 exit 1 @@ -99,6 +103,9 @@ while true ; do --serial) use_serial=$2 shift 2 ;; + --neon) + use_neon=$2 + shift 2 ;; --fp32) use_fp32=$2 shift 2 ;; @@ -108,6 +115,12 @@ while true ; do --int8) use_int8=$2 shift 2 ;; + --train) + cmake_options="${cmake_options} -DUSE_TRAINING=ON -DRAUL_CONFIG_BLAS_VENDOR=Huawei" + shift ;; + --secure) + cmake_options="${cmake_options} -DUSE_SECURE_C=ON" + shift ;; --clean) clean="on" shift ;; @@ -127,7 +140,7 @@ target=$(map_target ${target}) check_target ${target} if [[ "${converter}" == "ON" || "${converter}" == "on" ]]; then - cmake_options="${cmake_options} -DUSE_CAFFE=ON -DUSE_ONNX=ON -DUSE_TFLITE=ON -DUSE_TENSORFLOW=ON" + cmake_options="${cmake_options} -DUSE_CAFFE=ON -DUSE_ONNX=ON -DUSE_TFLITE=ON -DUSE_TENSORFLOW=ON -DUSE_MINDSPORE=ON" fi source ${script_dir}/scripts/setup_compiler.sh || exit 1 @@ -167,7 +180,9 @@ else cmake_options="${cmake_options} -DUSE_FP32=OFF" fi if [[ ${target} =~ aarch64 ]]; then - cmake_options="${cmake_options} -DUSE_NEON=ON" + if [[ "${use_neon}" == "ON" || "${use_neon}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_NEON=ON" + fi if [[ ${cmake_options} =~ USE_GPU=ON ]]; then use_fp16="on" fi @@ -183,6 +198,10 @@ if [[ ${target} =~ aarch64 ]]; then fi fi rm -rf test.log main + else + if [[ "${use_int8}" == "ON" || "${use_int8}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_INT8=ON" + fi fi elif [[ ${target} =~ avx ]]; then cmake_options="${cmake_options} -DUSE_X86=ON" @@ -200,7 +219,9 @@ else fi fi if [[ "${target}" == "linux-arm_himix100" || ${target} =~ armv7 || "${target}" == "linux-arm_musleabi" ]]; then - cmake_options="${cmake_options} -DUSE_NEON=ON" + if [[ "${use_neon}" == "ON" || "${use_neon}" == "on" ]]; then + cmake_options="${cmake_options} -DUSE_NEON=ON" + fi if [[ "${use_int8}" == "ON" || "${use_int8}" == "on" ]]; then cmake_options="${cmake_options} -DUSE_INT8=ON" fi @@ -238,6 +259,6 @@ if [[ ${cmake_options} =~ USE_FLOW=ON ]]; then fi ${BOLT_ROOT}/kit/setup.sh ${platform} ${kit_flow} || exit 1 -${MAKE} test ARGS="-V" +${MAKE} test ARGS="-V" || exit 1 cd .. 
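Editor's note on the install.sh changes above: the new --neon, --train and --secure switches map onto the CMake options wired in by this patch (USE_NEON, USE_TRAINING with RAUL_CONFIG_BLAS_VENDOR=Huawei, and USE_SECURE_C). A usage sketch follows; the target name is illustrative (taken from the android-aarch64 paths used elsewhere in this patch) and the exact set of supported targets is resolved by the setup scripts, not shown here:

    # default build for an ARM target: NEON kernels stay on
    ./install.sh --target=android-aarch64
    # explicitly disable NEON kernels (new --neon switch, default on for ARM)
    ./install.sh --target=android-aarch64 --neon=off
    # enable on-device training (adds -DUSE_TRAINING=ON -DRAUL_CONFIG_BLAS_VENDOR=Huawei)
    ./install.sh --target=android-aarch64 --train
    # build against the secure C wrappers (adds -DUSE_SECURE_C=ON)
    ./install.sh --target=android-aarch64 --secure

Note also that `${MAKE} test ARGS="-V" || exit 1` now propagates test failures to the caller instead of silently continuing.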
diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt new file mode 100644 index 00000000..a5cd74d6 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/encoder_flow.prototxt @@ -0,0 +1,350 @@ +name: "encoder" +input: "sounds" +input: "encoder_block0_trunk0_layer0_mem" +input: "encoder_block0_trunk0_layer1_mem" +input: "encoder_block1_trunk1_layer0_kmem" +input: "encoder_block1_trunk1_layer0_vmem" +input: "encoder_block1_trunk1_layer1_kmem" +input: "encoder_block1_trunk1_layer1_vmem" +input: "encoder_block2_trunk0_layer0_mem" +input: "encoder_block2_trunk0_layer1_mem" +input: "encoder_block2_trunk1_layer0_kmem" +input: "encoder_block2_trunk1_layer0_vmem" +input: "encoder_block2_trunk1_layer1_kmem" +input: "encoder_block2_trunk1_layer1_vmem" +input: "encoder_block3_trunk0_layer0_mem" +input: "encoder_block3_trunk0_layer1_mem" +input: "encoder_block3_trunk1_layer0_kmem" +input: "encoder_block3_trunk1_layer0_vmem" +input: "encoder_block3_trunk1_layer1_kmem" +input: "encoder_block3_trunk1_layer1_vmem" +input: "encoder_block3_trunk1_layer2_kmem" +input: "encoder_block3_trunk1_layer2_vmem" +input: "encoder_block3_trunk1_layer3_kmem" +input: "encoder_block3_trunk1_layer3_vmem" +output: "encoder_block3_transformer_ln" +output: "encoder_block0_conv0_neg_slice" +output: "encoder_block0_conv1_neg_slice" +output: "encoder_block1_transformer_layer0_k_neg_slice" +output: "encoder_block1_transformer_layer0_v_neg_slice" +output: "encoder_block1_transformer_layer1_k_neg_slice" +output: "encoder_block1_transformer_layer1_v_neg_slice" +output: "encoder_block2_conv0_neg_slice" +output: "encoder_block2_conv1_neg_slice" +output: "encoder_block2_transformer_layer0_k_neg_slice" +output: "encoder_block2_transformer_layer0_v_neg_slice" +output: "encoder_block2_transformer_layer1_k_neg_slice" +output: "encoder_block2_transformer_layer1_v_neg_slice" +output: "encoder_block3_conv0_neg_slice" +output: "encoder_block3_conv1_neg_slice" +output: "encoder_block3_transformer_layer0_k_neg_slice" +output: "encoder_block3_transformer_layer0_v_neg_slice" +output: "encoder_block3_transformer_layer1_k_neg_slice" +output: "encoder_block3_transformer_layer1_v_neg_slice" +output: "encoder_block3_transformer_layer2_k_neg_slice" +output: "encoder_block3_transformer_layer2_v_neg_slice" +output: "encoder_block3_transformer_layer3_k_neg_slice" +output: "encoder_block3_transformer_layer3_v_neg_slice" +node { + name: "sounds" + type: "Input" + output: "sounds" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 128 +} +node { + name: "encoder_block0_trunk0_layer0_mem" + type: "Input" + output: "encoder_block0_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 128 + input_dim: 1 +} +node { + name: "encoder_block0_trunk0_layer1_mem" + type: "Input" + output: "encoder_block0_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 32 + input_dim: 1 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + 
input_dim: 5 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block1_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block1_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 6 + input_dim: 64 +} +node { + name: "encoder_block2_trunk0_layer0_mem" + type: "Input" + output: "encoder_block2_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 384 +} +node { + name: "encoder_block2_trunk0_layer1_mem" + type: "Input" + output: "encoder_block2_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block2_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block2_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block2_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk0_layer0_mem" + type: "Input" + output: "encoder_block3_trunk0_layer0_mem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 2 + input_dim: 512 +} +node { + name: "encoder_block3_trunk0_layer1_mem" + type: "Input" + output: "encoder_block3_trunk0_layer1_mem" + input_type: "FLOAT32" + input_format: "NCHWC8" + input_dim: 1 + input_dim: 1024 + input_dim: 1 + input_dim: 1 +} +node { + name: "encoder_block3_trunk1_layer0_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer0_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer1_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 15 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer2_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer2_vmem" + input_type: "FLOAT32" + input_format: 
"NCHW" + input_dim: 1 + input_dim: 23 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_kmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_block3_trunk1_layer3_vmem" + type: "Input" + output: "encoder_block3_trunk1_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 31 + input_dim: 8 + input_dim: 64 +} +node { + name: "encoder_inference" + type: "Inference" + input: "sounds" + input: "encoder_block0_trunk0_layer0_mem" + input: "encoder_block0_trunk0_layer1_mem" + input: "encoder_block1_trunk1_layer0_kmem" + input: "encoder_block1_trunk1_layer0_vmem" + input: "encoder_block1_trunk1_layer1_kmem" + input: "encoder_block1_trunk1_layer1_vmem" + input: "encoder_block2_trunk0_layer0_mem" + input: "encoder_block2_trunk0_layer1_mem" + input: "encoder_block2_trunk1_layer0_kmem" + input: "encoder_block2_trunk1_layer0_vmem" + input: "encoder_block2_trunk1_layer1_kmem" + input: "encoder_block2_trunk1_layer1_vmem" + input: "encoder_block3_trunk0_layer0_mem" + input: "encoder_block3_trunk0_layer1_mem" + input: "encoder_block3_trunk1_layer0_kmem" + input: "encoder_block3_trunk1_layer0_vmem" + input: "encoder_block3_trunk1_layer1_kmem" + input: "encoder_block3_trunk1_layer1_vmem" + input: "encoder_block3_trunk1_layer2_kmem" + input: "encoder_block3_trunk1_layer2_vmem" + input: "encoder_block3_trunk1_layer3_kmem" + input: "encoder_block3_trunk1_layer3_vmem" + output: "encoder_block3_transformer_ln" + output: "encoder_block0_conv0_neg_slice" + output: "encoder_block0_conv1_neg_slice" + output: "encoder_block1_transformer_layer0_k_neg_slice" + output: "encoder_block1_transformer_layer0_v_neg_slice" + output: "encoder_block1_transformer_layer1_k_neg_slice" + output: "encoder_block1_transformer_layer1_v_neg_slice" + output: "encoder_block2_conv0_neg_slice" + output: "encoder_block2_conv1_neg_slice" + output: "encoder_block2_transformer_layer0_k_neg_slice" + output: "encoder_block2_transformer_layer0_v_neg_slice" + output: "encoder_block2_transformer_layer1_k_neg_slice" + output: "encoder_block2_transformer_layer1_v_neg_slice" + output: "encoder_block3_conv0_neg_slice" + output: "encoder_block3_conv1_neg_slice" + output: "encoder_block3_transformer_layer0_k_neg_slice" + output: "encoder_block3_transformer_layer0_v_neg_slice" + output: "encoder_block3_transformer_layer1_k_neg_slice" + output: "encoder_block3_transformer_layer1_v_neg_slice" + output: "encoder_block3_transformer_layer2_k_neg_slice" + output: "encoder_block3_transformer_layer2_v_neg_slice" + output: "encoder_block3_transformer_layer3_k_neg_slice" + output: "encoder_block3_transformer_layer3_v_neg_slice" + infer_output_size_parameter: "encoderInferOutputSize" + preprocess_parameter: "encoderPreProcess" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_encoder_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt new file mode 100644 index 00000000..d8ceb477 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/joint_flow.prototxt @@ -0,0 +1,33 @@ +name: "joint_flow" +input: "encoder" +input: "prediction_net" +output: "output_argmax" +node { + name: "encoder" + type: "Input" + output: "encoder" + input_type: "FLOAT32" + input_format: "MTK" + 
input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "prediction_net" + type: "Input" + output: "prediction_net" + input_type: "FLOAT32" + input_format: "MTK" + input_dim: 1 + input_dim: 1 + input_dim: 512 +} +node { + name: "joint_inference" + type: "Inference" + input: "encoder" + input: "prediction_net" + output: "output_argmax" + infer_output_size_parameter: "jointInferOutputSize" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_joint_net_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt new file mode 100644 index 00000000..a493aa81 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/pinyin2hanzi_flow.prototxt @@ -0,0 +1,24 @@ +name: "pinyin2hanzi_flow" +input: "pinyin" +output: "hanzi_squeeze/Squeeze" +node { + name: "pinyin" + type: "Input" + output: "pinyin" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 32 +} +node { + name: "pinyin2hanzi_inference" + type: "Inference" + input: "pinyin" + output: "hanzi_squeeze/Squeeze" + infer_output_size_parameter: "pinyin2hanziInferOutputSize" + preprocess_parameter: "pinyin2hanziPreProcess" + preprocess_parameter: "/data/user/0/com.huawei.noah/cache/pinyin_lm_embedding.bin" + preprocess_parameter: "1601" + preprocess_parameter: "512" + inference_parameter: "/data/user/0/com.huawei.noah/cache/cnn_pinyin_lm_b7h512e4_cn_en_20200518_cloud_fp32_f32.bolt" +} diff --git a/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt new file mode 100644 index 00000000..c5707a29 --- /dev/null +++ b/kit/Android/ChineseSpeechRecognition/app/src/main/assets/prediction_flow.prototxt @@ -0,0 +1,139 @@ +name: "prediction" +input: "label" +input: "prediction_net_layer0_kmem" +input: "prediction_net_layer0_vmem" +input: "prediction_net_layer1_kmem" +input: "prediction_net_layer1_vmem" +input: "prediction_net_layer2_kmem" +input: "prediction_net_layer2_vmem" +input: "prediction_net_layer3_kmem" +input: "prediction_net_layer3_vmem" +output: "prediction_net_ln" +output: "prediction_net_layer0_k_neg_slice" +output: "prediction_net_layer0_v_neg_slice" +output: "prediction_net_layer1_k_neg_slice" +output: "prediction_net_layer1_v_neg_slice" +output: "prediction_net_layer2_k_neg_slice" +output: "prediction_net_layer2_v_neg_slice" +output: "prediction_net_layer3_k_neg_slice" +output: "prediction_net_layer3_v_neg_slice" +node { + name: "label" + type: "Input" + output: "label" + input_type: "UINT32" + input_format: "NORMAL" + input_dim: 1 + input_dim: 1 +} +node { + name: "prediction_net_layer0_kmem" + type: "Input" + output: "prediction_net_layer0_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer0_vmem" + type: "Input" + output: "prediction_net_layer0_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 3 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_kmem" + type: "Input" + output: "prediction_net_layer1_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer1_vmem" + type: "Input" + output: "prediction_net_layer1_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + 
input_dim: 1 + input_dim: 5 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_kmem" + type: "Input" + output: "prediction_net_layer2_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer2_vmem" + type: "Input" + output: "prediction_net_layer2_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 7 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_kmem" + type: "Input" + output: "prediction_net_layer3_kmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_net_layer3_vmem" + type: "Input" + output: "prediction_net_layer3_vmem" + input_type: "FLOAT32" + input_format: "NCHW" + input_dim: 1 + input_dim: 9 + input_dim: 8 + input_dim: 64 +} +node { + name: "prediction_inference" + input: "label" + input: "prediction_net_layer0_kmem" + input: "prediction_net_layer0_vmem" + input: "prediction_net_layer1_kmem" + input: "prediction_net_layer1_vmem" + input: "prediction_net_layer2_kmem" + input: "prediction_net_layer2_vmem" + input: "prediction_net_layer3_kmem" + input: "prediction_net_layer3_vmem" + output: "prediction_net_ln" + output: "prediction_net_layer0_k_neg_slice" + output: "prediction_net_layer0_v_neg_slice" + output: "prediction_net_layer1_k_neg_slice" + output: "prediction_net_layer1_v_neg_slice" + output: "prediction_net_layer2_k_neg_slice" + output: "prediction_net_layer2_v_neg_slice" + output: "prediction_net_layer3_k_neg_slice" + output: "prediction_net_layer3_v_neg_slice" + infer_output_size_parameter: "predictionInferOutputSize" + inference_parameter: "/data/user/0/com.huawei.noah/cache/asr_convolution_transformer_prediction_net_f32.bolt" +} diff --git a/kit/Android/ReadingComprehension/.gitignore b/kit/Android/ReadingComprehension/.gitignore new file mode 100644 index 00000000..aa724b77 --- /dev/null +++ b/kit/Android/ReadingComprehension/.gitignore @@ -0,0 +1,15 @@ +*.iml +.gradle +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties diff --git a/kit/Android/ReadingComprehension/app/.gitignore b/kit/Android/ReadingComprehension/app/.gitignore new file mode 100644 index 00000000..42afabfd --- /dev/null +++ b/kit/Android/ReadingComprehension/app/.gitignore @@ -0,0 +1 @@ +/build \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/build.gradle b/kit/Android/ReadingComprehension/app/build.gradle new file mode 100644 index 00000000..2e3ece18 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/build.gradle @@ -0,0 +1,48 @@ +plugins { + id 'com.android.application' +} + +android { + compileSdkVersion 32 + buildToolsVersion "30.0.3" + + defaultConfig { + applicationId "com.huawei.noah" + minSdkVersion 16 + targetSdkVersion 32 + versionCode 1 + versionName "1.0" + + testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner" + + ndk{ + abiFilters "arm64-v8a" + } + } + + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro' + } + } + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } + + + buildFeatures { + viewBinding true + } +} + +dependencies { + + implementation 
'androidx.appcompat:appcompat:1.4.0' + implementation 'com.google.android.material:material:1.4.0' + implementation 'androidx.constraintlayout:constraintlayout:2.1.2' + testImplementation 'junit:junit:4.+' + androidTestImplementation 'androidx.test.ext:junit:1.1.3' + androidTestImplementation 'androidx.test.espresso:espresso-core:3.3.0' +} \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/proguard-rules.pro b/kit/Android/ReadingComprehension/app/proguard-rules.pro new file mode 100644 index 00000000..481bb434 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/proguard-rules.pro @@ -0,0 +1,21 @@ +# Add project specific ProGuard rules here. +# You can control the set of applied configuration files using the +# proguardFiles setting in build.gradle. +# +# For more details, see +# http://developer.android.com/guide/developing/tools/proguard.html + +# If your project uses WebView with JS, uncomment the following +# and specify the fully qualified class name to the JavaScript interface +# class: +#-keepclassmembers class fqcn.of.javascript.interface.for.webview { +# public *; +#} + +# Uncomment this to preserve the line number information for +# debugging stack traces. +#-keepattributes SourceFile,LineNumberTable + +# If you keep the line number information, uncomment this to +# hide the original source file name. +#-renamesourcefileattribute SourceFile \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml b/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml new file mode 100644 index 00000000..57d457d5 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/AndroidManifest.xml @@ -0,0 +1,23 @@ + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java new file mode 100644 index 00000000..f207e6e0 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/MainActivity.java @@ -0,0 +1,250 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah; + +import androidx.appcompat.app.AppCompatActivity; + +import android.content.Intent; +import android.os.Bundle; +import android.os.Handler; +import android.os.Looper; +import android.util.Log; +import android.view.Display; +import android.view.View; +import android.widget.EditText; +import android.widget.ProgressBar; +import android.widget.TextView; +import android.widget.Toast; + +import com.huawei.noah.bert.AppTokenizer; +import com.huawei.noah.bert.PredictionModel; +import com.huawei.noah.databinding.ActivityMainBinding; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +public class MainActivity extends AppCompatActivity implements View.OnClickListener { + private ActivityMainBinding binding; + private EditText content; + private EditText question; + private TextView answer; + private static final String VOCAB = "vocab.txt"; + private static final String MODEL = "bert_squad_10_f32.bolt"; + private String modelPath; + private AppTokenizer appTokenizer; + private ProgressBar progressBar; + + private ExecutorService executorService; + @Override protected void onCreate(Bundle savedInstanceState) + { + super.onCreate(savedInstanceState); + + binding = ActivityMainBinding.inflate(getLayoutInflater()); + setContentView(binding.getRoot()); + + executorService = Executors.newFixedThreadPool(1); + content = binding.content; + question = binding.question; + answer = binding.answer; + progressBar = binding.progress; + + findViewById(R.id.demo1).setOnClickListener(this); + findViewById(R.id.demo2).setOnClickListener(this); + findViewById(R.id.ask_button).setOnClickListener(this); + + String vocab = getCacheDir() + File.separator + VOCAB; + modelPath = getCacheDir() + File.separator + MODEL; + + try { + copyAssetResource2File(VOCAB, vocab); + copyAssetResource2File(MODEL, modelPath); + } catch (IOException e) { + e.printStackTrace(); + } + + appTokenizer = new AppTokenizer(vocab); + } + + private void copyAssetResource2File(String assetsFile, String outFile) throws IOException + { + File outF = new File(outFile); + if (outF.exists()) + return; + InputStream is = this.getAssets().open(assetsFile); + FileOutputStream fos = new FileOutputStream(outF); + int byteCount; + byte[] buffer = new byte[1024]; + while ((byteCount = is.read(buffer)) != -1) { + fos.write(buffer, 0, byteCount); + } + fos.flush(); + is.close(); + fos.close(); + outF.setReadable(true); + } + + @Override public void onClick(View v) + { + switch (v.getId()) { + case R.id.ask_button: { + if (content.getText().toString().length() == 0) { + Toast + .makeText( + getApplicationContext(), "Content can not be null", Toast.LENGTH_LONG) + .show(); + return; + } else if (question.getText().toString().length() == 0) { + Toast + .makeText( + getApplicationContext(), "Question can not be null", Toast.LENGTH_LONG) + .show(); + return; + } + + progressBar.setVisibility(View.VISIBLE); + executorService.submit(new Runnable() { + @Override public void run() + { + float[][] tokenizers = appTokenizer.runTokenizer( + content.getText().toString(), question.getText().toString()); + int[] inputCActual = { + tokenizers[0].length, tokenizers[1].length, tokenizers[2].length}; + int inputNum = 
3; + int outputNum = 2; + String[] inputName = {"input_ids:0", "input_mask:0", "segment_ids:0"}; + String[] outputName = {"unstack:0", "unstack:1"}; + int[] inputN = {1, 1, 1}; + int[] inputCMax = {256, 256, 256}; + int[] inputH = {1, 1, 1}; + int[] inputW = {1, 1, 1}; + DataType[] inputDatatype = {DataType.INT32, DataType.INT32, DataType.INT32}; + DataFormat[] inputDataFormat = { + DataFormat.NORMAL, DataFormat.NORMAL, DataFormat.NORMAL}; + BoltModel boltModel = new BoltModel(modelPath, + AffinityType.CPU_HIGH_PERFORMANCE, inputNum, inputName, inputN, + inputCMax, inputH, inputW, inputDatatype, inputDataFormat, outputNum, + outputName); + BoltResult boltResult = boltModel.run(inputNum, inputName, inputN, + inputCActual, inputH, inputW, inputDatatype, inputDataFormat, + tokenizers); + float[][] result = boltResult.getResultData(); + String resultStr = getResultAnswer(result); + boltModel.destructor(); + doOnUiCode(resultStr); + } + }); + + } break; + case R.id.demo1: { + content.setText(getString(R.string.Demo1)); + question.setText(getString(R.string.Ques1)); + answer.setText(""); + } break; + case R.id.demo2: { + content.setText(getString(R.string.Demo2)); + question.setText(getString(R.string.Ques2)); + answer.setText(""); + } break; + + default: + break; + } + } + + private void doOnUiCode(String string) + { + Handler uiThread = new Handler(Looper.getMainLooper()); + uiThread.post(new Runnable() { + @Override public void run() + { + answer.setText(string); + progressBar.setVisibility(View.GONE); + } + }); + } + + private String getResultAnswer(float[][] result) + { + ArrayList start_index = getBestIndexs(result[0], 20); + ArrayList end_index = getBestIndexs(result[1], 20); + ArrayList predictionModels = new ArrayList<>(); + for (int start : start_index) { + for (int end : end_index) { + predictionModels.add( + new PredictionModel(start, end, result[0][start], result[1][end])); + } + } + Collections.sort(predictionModels, new Comparator() { + @Override public int compare(PredictionModel o1, PredictionModel o2) + { + if ((o1.start_logit + o1.end_logit) >= (o2.start_logit + o2.end_logit)) { + return -1; + } else + return 1; + } + }); + + PredictionModel predictionModel = predictionModels.get(2); + String tok = ""; + for (int i = predictionModel.start; i <= predictionModel.end; i++) { + if (appTokenizer.features_.get(i).contains("##")) { + String s = appTokenizer.features_.get(i).substring( + appTokenizer.features_.get(i).indexOf("##") + 2); + tok += s; + } else { + if (i == predictionModel.start) { + tok += appTokenizer.features_.get(i); + } else { + tok += " " + appTokenizer.features_.get(i); + } + } + } + return tok; + } + + private ArrayList getBestIndexs(float[] datas, int bestSize) + { + ArrayList results = new ArrayList<>(); + Map unstack = new TreeMap(new Comparator() { + @Override public int compare(Float o1, Float o2) + { + return o2.compareTo(o1); + } + }); + + for (int i = 0; i < 256; i++) { + unstack.put(datas[i], i); + } + + int index = 0; + for (Iterator i = unstack.values().iterator(); i.hasNext();) { + if (index >= bestSize) + break; + Object obj = i.next(); + results.add((int)obj); + index++; + } + return results; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java new file mode 100644 index 00000000..d42df66d --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/AppTokenizer.java @@ 
-0,0 +1,119 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +package com.huawei.noah.bert; + +import android.util.Log; + +import java.util.ArrayList; +import java.util.List; + +public class AppTokenizer { + private static final String TAG = "AppTokenizer"; + + private int maxSeqLength; + private int tokenSize; + private List tokens; + private FullTokenizer tokenizer; + public List features_ = new ArrayList<>(); + + public AppTokenizer(String vocab) + { + this.maxSeqLength = 256; + tokenizer = new FullTokenizer(vocab); + } + + public float[][] runTokenizer(String paragraph, String question) + { + List paragraph_tokens = tokenizer.tokenize(paragraph); + List feture1 = tokenizer.getFeaturesList(); + List question_tokens = tokenizer.tokenize(question); + List feture2 = tokenizer.getFeaturesList(); + + return getExampleSingle(tokenizer, paragraph_tokens, question_tokens, feture1, feture2); + } + + private float[][] getExampleSingle(FullTokenizer tokenizer, + List paragraph_tokens, + List question_tokens, + List feature1, + List feature2) + { + tokens = new ArrayList<>(); + List segmentIds = new ArrayList<>(); + List positions = new ArrayList(); + + features_.clear(); + features_.add("[CLS]"); + tokens.add("[CLS]"); + + for (int i = 0; i < question_tokens.size(); i++) { + tokens.add(question_tokens.get(i)); + features_.add(feature2.get(i)); + } + tokens.add("[SEP]"); + features_.add("[SEP]"); + + for (int i = 0; i < paragraph_tokens.size(); i++) { + tokens.add(paragraph_tokens.get(i)); + features_.add(feature1.get(i)); + } + tokens.add("[SEP]"); + features_.add("[SEP]"); + + List inputIds = tokenizer.convertTokensToIds(tokens); + for (int i = 0; i < maxSeqLength; i++) { + if (i < inputIds.size()) { + if (i < question_tokens.size() + 2) { + segmentIds.add(0); + } else { + segmentIds.add(1); + } + positions.add(1); + } else { + inputIds.add(0); + segmentIds.add(0); + positions.add(0); + } + } + + float[][] outputs = new float[3][maxSeqLength]; + for (int i = 0; i < inputIds.size(); i++) { + outputs[0][i] = inputIds.get(i); + } + for (int i = 0; i < positions.size(); i++) { + outputs[1][i] = positions.get(i); + } + + for (int i = 0; i < segmentIds.size(); i++) { + outputs[2][i] = segmentIds.get(i); + } + + tokenSize = paragraph_tokens.size() + 2 + question_tokens.size() + 1; + Log.i(TAG, "getExampleSingle: tokenSize = " + tokenSize); + return outputs; + } + + public String getTokens() + { + StringBuilder stringBuilder = new 
StringBuilder(); + for (int i = 1; i < tokens.size() - 1; i++) { + stringBuilder.append(tokens.get(i)).append(" "); + } + return stringBuilder.toString(); + } + + public int getTokenSize() + { + return tokenSize; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java new file mode 100644 index 00000000..0abf15ba --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/BasicTokenizer.java @@ -0,0 +1,162 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class BasicTokenizer { + public List tokenize(String text) + { + String cleanText = cleanText(text); + + String chineseTokens = tokenizeChineseChars(cleanText); + + List origTokens = whiteSpaceTokenize(chineseTokens); + + String str = ""; + for (String token : origTokens) { + List list = runSplitOnPunc(token); + for (int i = 0; i < list.size(); i++) { + str += list.get(i) + " "; + } + } + + List resTokens = whiteSpaceTokenize(str); + + return resTokens; + } + + private List runSplitOnPunc(String token) + { + List> result = new ArrayList>(); + + int length = token.length(); + int i = 0; + boolean startNewWord = true; + while (i < length) { + char c = token.charAt(i); + if (isPunctuation(c)) { + List list = Arrays.asList(c); + result.add(list); + startNewWord = true; + } else { + if (startNewWord) { + result.add(new ArrayList()); + } + startNewWord = false; + result.get(result.size() - 1).add(c); + } + i += 1; + } + + List res = new ArrayList(); + for (int j = 0; j < result.size(); j++) { + String str = ""; + for (int k = 0; k < result.get(j).size(); k++) { + str += result.get(j).get(k); + } + res.add(str); + } + return res; + } + + private boolean isPunctuation(char c) + { + if ((c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || + (c >= 123 && c <= 126)) { + return true; + } + + if (c == '“' || c == '”' || c == '、' || c == '《' || c == '》' || c == '。' || c == ';' || + c == '【' || c == '】') { + return true; + } + + return false; + } + + private List whiteSpaceTokenize(String text) + { + List result = new ArrayList(); + + text = text.trim(); + if (null == text) { + return result; + } + 
String[] tokens = text.split(" "); + result = Arrays.asList(tokens); + + return result; + } + + private String tokenizeChineseChars(String cleanText) + { + StringBuffer outStrBuf = new StringBuffer(); + + for (int i = 0; i < cleanText.length(); i++) { + char c = cleanText.charAt(i); + if (isChineseChar(c)) { + outStrBuf.append(" "); + outStrBuf.append(c); + outStrBuf.append(" "); + } else { + outStrBuf.append(c); + } + } + + return outStrBuf.toString(); + } + + private boolean isChineseChar(char c) + { + String s = String.valueOf(c); + String regex = "[\u4e00-\u9fa5]"; + Pattern p = Pattern.compile(regex); + + Matcher m = p.matcher(s); + return m.matches(); + } + + private String cleanText(String text) + { + StringBuffer outStrBuf = new StringBuffer(""); + + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (isWhiteSpace(c)) { + outStrBuf.append(" "); + } else { + outStrBuf.append(c); + } + } + return outStrBuf.toString(); + } + + private boolean isWhiteSpace(char c) + { + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { + return true; + } + + return false; + } + + public static void main(String[] args) + { + System.out.print("hello world"); + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java new file mode 100644 index 00000000..1e87c218 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/FullTokenizer.java @@ -0,0 +1,95 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class WordpieceTokenizer { + private Map vocab; + private String unkToken = "[UNK]"; + private int maxInputCharsPerWord = 200; + private List featuresList = new ArrayList<>(); + + public WordpieceTokenizer(Map vocab) + { + this.vocab = vocab; + } + + /* + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + */ + public List tokenize(String text) + { + String lowText = text.toLowerCase(); + + featuresList.clear(); + List outputTokens = new ArrayList(); + + int length = lowText.length(); + if (length > this.maxInputCharsPerWord) { + outputTokens.add(this.unkToken); + } + + boolean isBad = false; + int start = 0; + List subTokens = new ArrayList(); + List featureTokens = new ArrayList(); + + while (start < length) { + int end = length; + String curSubStr = null; + String featureSubStr = null; + while (start < end) { + String subStr = lowText.substring(start, end); + String featureStr = text.substring(start, end); + if (start > 0) { + subStr = "##" + subStr; + featureStr = "##" + featureStr; + } + if (this.vocab.containsKey(subStr)) { + curSubStr = subStr; + featureSubStr = featureStr; + break; + } + end -= 1; + } + if (null == curSubStr) { + isBad = true; + break; + } + subTokens.add(curSubStr); + featureTokens.add(featureSubStr); + start = end; + } + + if (isBad) { + outputTokens.add(this.unkToken); + featuresList.add(this.unkToken); + } else { + outputTokens.addAll(subTokens); + featuresList.addAll(featureTokens); + } + + return outputTokens; + } + + public List getFeaturesList() + { + return featuresList; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java new file mode 100644 index 00000000..eac02c50 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/PredictionModel.java @@ -0,0 +1,27 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +public class PredictionModel { + public int start; + public int end; + public float start_logit; + public float end_logit; + + public PredictionModel(int aStart, int aEnd, float startLogit, float endLogit){ + start=aStart; + end=aEnd; + start_logit=startLogit; + end_logit=endLogit; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java new file mode 100644 index 00000000..701076a2 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/java/com/huawei/noah/bert/WordpieceTokenizer.java @@ -0,0 +1,94 @@ +// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+package com.huawei.noah.bert; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +public class WordpieceTokenizer { + + private Map vocab; + private String unkToken = "[UNK]"; + private int maxInputCharsPerWord = 200; + private List featuresList=new ArrayList<>(); + + public WordpieceTokenizer(Map vocab){ + this.vocab = vocab; + } + + /* + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + */ + public List tokenize(String text){ + + String lowText=text.toLowerCase(); + + featuresList.clear(); + List outputTokens = new ArrayList(); + + int length = lowText.length(); + if(length > this.maxInputCharsPerWord){ + outputTokens.add(this.unkToken); + } + + boolean isBad = false; + int start = 0; + List subTokens = new ArrayList(); + List featureTokens = new ArrayList(); + + while(start < length){ + int end = length; + String curSubStr = null; + String featureSubStr = null; + while(start < end){ + String subStr = lowText.substring(start, end); + String featureStr = text.substring(start, end); + if(start > 0){ + subStr = "##" + subStr; + featureStr = "##" + featureStr; + } + if(this.vocab.containsKey(subStr)){ + curSubStr = subStr; + featureSubStr = featureStr; + break; + } + end -= 1; + } + if(null == curSubStr){ + isBad = true; + break; + } + subTokens.add(curSubStr); + featureTokens.add(featureSubStr); + start = end; + } + + if(isBad){ + outputTokens.add(this.unkToken); + featuresList.add(this.unkToken); + }else{ + outputTokens.addAll(subTokens); + featuresList.addAll(featureTokens); + } + + return outputTokens; + } + + public List getFeaturesList(){ + return featuresList; + } +} diff --git a/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml b/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 00000000..2b068d11 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml b/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 00000000..07d5da9c --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml b/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml new file mode 100644 index 00000000..5a2d6be6 --- /dev/null +++ b/kit/Android/ReadingComprehension/app/src/main/res/layout/activity_main.xml @@ -0,0 +1,146 @@ + + + + + + + + + + + + + + + +
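The WordpieceTokenizer above documents its own behaviour with the `unaffable -> [un, ##aff, ##able]` comment. Here is a small, self-contained usage sketch of that greedy longest-match logic, using a hand-built toy vocabulary (the shipped demo loads the full BERT vocab.txt instead); only `containsKey` is consulted by the tokenizer, so the integer ids are arbitrary:

```java
package com.huawei.noah.bert;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class WordpieceTokenizerSketch {
    public static void main(String[] args) {
        // Toy vocabulary; the real app builds this map from vocab.txt.
        Map<String, Integer> vocab = new HashMap<>();
        vocab.put("un", 1);
        vocab.put("##aff", 2);
        vocab.put("##able", 3);

        WordpieceTokenizer tokenizer = new WordpieceTokenizer(vocab);

        // Greedy longest-match from the left yields: [un, ##aff, ##able]
        List<?> pieces = tokenizer.tokenize("unaffable");
        System.out.println(pieces);

        // A word with no matching sub-strings collapses to the unknown token: [[UNK]]
        System.out.println(tokenizer.tokenize("zzz"));
    }
}
```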
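Similarly, the BasicTokenizer added earlier in this change interleaves three passes — whitespace cleanup, spacing out Chinese characters, and splitting punctuation into standalone tokens. A short sketch of the resulting behaviour on mixed English/Chinese input; the expected output noted in the comment is worked out from the code above, not taken from a test in this change:

```java
package com.huawei.noah.bert;

import java.util.List;

public class BasicTokenizerSketch {
    public static void main(String[] args) {
        BasicTokenizer tokenizer = new BasicTokenizer();

        // Chinese characters become single-character tokens and punctuation
        // is split into its own token; expected: [Hello, ,, 世, 界, !]
        List<?> tokens = tokenizer.tokenize("Hello, 世界!");
        System.out.println(tokens);
    }
}
```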