diff --git a/.Doxyfile b/.Doxyfile
index c3386af2..9dbfe4ba 100644
--- a/.Doxyfile
+++ b/.Doxyfile
@@ -771,8 +771,11 @@ WARN_LOGFILE =
# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
# Note: If this tag is empty the current directory is searched.
-INPUT = ./inference/engine/api ./inference/flow/include/flow.h \
-./common/uni/include/task.h ./inference/flow/src/flow.proto
+INPUT = ./inference/engine/api/c \
+./inference/engine/api/java \
+./inference/flow/include/flow.h ./common/uni/include/task.h ./inference/flow/src/flow.proto \
+./training/api/training/api/API.h \
+./training/demos/common/training.h
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
diff --git a/.gitignore b/.gitignore
index 587e6768..016a22bc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,8 @@ kit/Android/SimpleImageClassification/app/src/main/java
kit/iOS/SimpleImgClassfication/libbolt
kit/Android/Semantics/app/src/main/java
kit/Android/Semantics/app/src/main/assets/
+kit/Android
+kit/iOS
final_combinations.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1d06678f..9ca700b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,6 +25,9 @@ if (NOT "$ENV{JNI_ROOT}" STREQUAL "")
set(USE_JNI ON)
endif(JNI_FOUND)
endif ()
+if (USE_SECURE_C)
+ find_package(SecureC)
+endif ()
if (BUILD_TEST)
find_package(jpeg)
if (EXISTS ${OpenCV_CMAKE_PATH})
@@ -33,7 +36,7 @@ if (BUILD_TEST)
endif (BUILD_TEST)
add_subdirectory(common)
-if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW)
+if (USE_CAFFE OR USE_ONNX OR USE_TFLITE OR USE_TENSORFLOW OR USE_MINDSPORE)
add_subdirectory(model_tools)
endif()
add_subdirectory(compute)
@@ -45,6 +48,13 @@ message(STATUS "CXXFLAGS: ${CMAKE_CXX_FLAGS}")
add_custom_target(bolt_library ALL
COMMAND bash ./scripts/build_light_bolt.sh ${CMAKE_SYSTEM_NAME} ${CMAKE_CXX_COMPILER} ${CMAKE_AR} ${CMAKE_STRIP} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_OUTPUT_EXTENSION} ${CMAKE_SHARED_LIBRARY_PREFIX} ${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_STATIC_LIBRARY_PREFIX} ${CMAKE_STATIC_LIBRARY_SUFFIX} ${CMAKE_BINARY_DIR}
WORKING_DIRECTORY ${BOLT_ROOT})
+if (USE_TRAINING)
+ set(TRAINING_BUILD_C_API ON)
+ set(TRAINING_BUILD_DEMO ON)
+ add_subdirectory(training)
+ add_dependencies(Raul blas_enhance uni)
+ add_dependencies(Raul blas_enhance_static uni_static)
+endif (USE_TRAINING)
add_dependencies(bolt_library engine model_spec tensor image blas_enhance uni)
add_dependencies(bolt_library engine_static model_spec_static tensor_static image_static blas_enhance_static uni_static)
@@ -70,11 +80,30 @@ endif ()
enable_testing()
find_program (BASH_PROGRAM bash)
if (BASH_PROGRAM AND USE_GENERAL)
+ file(GLOB CPUINFO_CMAKE_FILE $ENV{BOLT_ROOT}/common/cmakes/cpuinfo.cmake ${BOLT_ROOT}/common/cmakes/cpuinfo.cmake)
+ include(${CPUINFO_CMAKE_FILE})
set(parameters --host_dir=${CMAKE_INSTALL_PREFIX})
if (ANDROID)
set(parameters ${parameters} -d android --device_dir=/data/local/tmp/uldra)
elseif("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_HOST_SYSTEM}" MATCHES "${CMAKE_SYSTEM_NAME}*")
- set(parameters ${parameters} -d host)
+ if ("${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "aarch64" OR "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "armv7")
+ set(parameters ${parameters} -d host)
+ elseif (USE_X86)
+ set(x86_test ${cpuinfo_avx2})
+ if (USE_INT8)
+ set(x86_test ${cpuinfo_avx512})
+ endif ()
+ if (USE_AVX512_VNNI)
+ set(x86_test ${cpuinfo_avx512_vnni})
+ endif()
+ if (x86_test)
+ set(parameters ${parameters} -d host)
+ else ()
+ set(parameters ${parameters} -d unknown)
+ endif ()
+ else ()
+ set(parameters ${parameters} -d unknown)
+ endif()
else()
set(parameters ${parameters} -d unknown)
endif()
diff --git a/README.md b/README.md
index 9b37e130..995733cd 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
---
[![License: MIT](docs/images/license-mit.png)](https://opensource.org/licenses/MIT)
-[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning. Bolt, as a universal deployment tool for all kinds of neural networks, aims to minimize the inference runtime as much as possible.
+[Bolt](https://huawei-noah.github.io/bolt/) is a light-weight library for deep learning.
+Bolt, as a universal deployment tool for all kinds of neural networks, aims to automate the deployment pipeline and achieve extreme acceleration.
Bolt has been widely deployed and used in many departments of HUAWEI company, such as 2012 Laboratory, CBG and HUAWEI Product Lines.
If you have questions or suggestions, you can submit issue. **QQ群: 833345709**
@@ -11,7 +12,7 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709**
- **High Performance:** **15%+** faster than existing open source acceleration libraries.
- **Rich Model Conversion:** support Caffe, ONNX, TFLite, Tensorflow.
- **Various Inference Precision:** support FP32, FP16, INT8, 1-BIT.
-- **Multiple platforms:** ARM CPU(v7, v8, v8.2), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512)
+- **Multiple platforms:** ARM CPU(v7, v8, v8.2+), Mali GPU, Qualcomm GPU, X86 CPU(AVX2, AVX512)
- **Bolt is the first to support NLP and also supports common CV applications.**
- **Minimize ROM/RAM**
- Rich Graph Optimization
@@ -23,30 +24,42 @@ If you have questions or suggestions, you can submit issue. **QQ群: 833345709**
# Building Status
---
-Kinds of choices are provided for the compilation of bolt. Please make a suitable choice depending on your environment.
-
-| target platform | build command | Linux | Windows | MacOS |
-| -------------------- | -------------------------------------------- | ----- | ------- | ----- |
-| Android(armv7) | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) |
-| Android(armv8+gpu) | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) |
-| Android(x86_64) | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) |
-| iOS(armv7) | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) |
-| iOS(armv8) | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) |
-| Linux(X86_64) | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / |
-| Linux(x86_64_avx2) | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / |
-| Windows(X86_64) | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | / |
-| Windows(x86_64_avx2) | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / |
-| MacOS(X86_64) | ./install.sh --target=macos-x86_64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) |
-| MacOS(x86_64_avx2) | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) |
-
-*NOTE: Bolt defaultly link static library, This may cause some problem on some platforms. You can use --shared option to link shared library.*
+There are some commonly used platforms for inference. More targets can be seen from [scripts/target.sh](scripts/target.sh). Please make a suitable choice depending on your environment.
+If you want to build on-device training module, you can add **--train** option.
+If you want to use multi-threads parallel, you can add **--openmp** option.
+
+*Bolt links static libraries by default. This may cause some problems on some platforms. You can use the --shared option to link shared libraries.*
+
+| target platform | precision | build command | Linux | Windows | MacOS |
+| ---------------------- | ------------------ | ---------------------------------------------------- | ----- | ------- | ----- |
+| Android(armv7) | fp32,int8 | ./install.sh --target=android-armv7 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv7) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv7) |
+| Android(armv8) | fp32,int8 | ./install.sh --target=android-aarch64 --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) |
+| Android(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=android-aarch64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) |
+| Android(gpu) | fp16 | ./install.sh --target=android-aarch64 --gpu | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-armv8) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-armv8) |
+| Android(x86_64) | fp32,int8 | ./install.sh --target=android-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-android-x86_64) | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-android-x86_64)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-android-x86_64) |
+| iOS(armv7) | fp32,int8 | ./install.sh --target=ios-armv7 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv7)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv7) |
+| iOS(armv8) | fp32,int8 | ./install.sh --target=ios-aarch64 --fp16=off | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) |
+| iOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=ios-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-ios-armv8)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-ios-armv8) |
+| Linux(armv7) | fp32,int8 | ./install.sh --target=linux-armv7_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / |
+| Linux(armv8) | fp32,int8 | ./install.sh --target=linux-aarch64_blank --fp16=off | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / |
+| Linux(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=linux-aarch64_blank | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / |
+| Linux(x86_64) | fp32,int8 | ./install.sh --target=linux-x86_64 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86) | / | / |
+| Linux(x86_64_avx2) | fp32 | ./install.sh --target=linux-x86_64_avx2 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / |
+| Linux(x86_64_avx512) | fp32,int8 | ./install.sh --target=linux-x86_64_avx512 | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/linux-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Alinux-x86-avx2) | / | / |
+| Windows(x86_64) | fp32,int8 | ./install.sh --target=windows-x86_64 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86) | / |
+| Windows(x86_64_avx2) | fp32 | ./install.sh --target=windows-x86_64_avx2 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / |
+| Windows(x86_64_avx512) | fp32,int8 | ./install.sh --target=windows-x86_64_avx512 | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/windows-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Awindows-x86-avx2) | / |
+| MacOS(armv8.2+) | fp32,fp16,int8,bnn | ./install.sh --target=macos-aarch64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) |
+| MacOS(x86_64) | fp32,int8 | ./install.sh --target=macos-x86_64 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86) |
+| MacOS(x86_64_avx2) | fp32 | ./install.sh --target=macos-x86_64_avx2 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) |
+| MacOS(x86_64_avx512) | fp32,int8 | ./install.sh --target=macos-x86_64_avx512 | / | / | [![Build Status](https://img.shields.io/github/workflow/status/huawei-noah/bolt/macos-x86-avx2)](https://github.com/huawei-noah/bolt/actions?query=workflow%3Amacos-x86-avx2) |
# Quick Start
---
Two steps to get started with bolt.
-1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe,onnx,tflite or tensorflow to .bolt;
+1. Conversion: use **[X2bolt](model_tools/tools/X2bolt/X2bolt.cpp)** to convert your model from caffe, onnx, tflite or tensorflow to .bolt file;
2. Inference: run **[benchmark](inference/examples/benchmark/benchmark.cpp)** with .bolt and data to get the inference result.
@@ -56,9 +69,10 @@ Two steps to get started with bolt.
Here we show some interesting and useful applications in bolt.
-| Face Detection | ASR | Semantics Analysis | Image Classification
-| :------: | :------: | :------: |:------:
-| [![face_detection](docs/images/20_bolt_face_detection.gif)](inference/examples/ultra_face) demo_link: [face detection](inference/examples/ultra_face) | [![asr](docs/images/ChineseSpeechRecognition.gif)]() demo_link: [asr](inference/examples/automatic_speech_recognition) | [![semantics analysis](docs/images/SemanticsAnalysis.gif)]() demo_link: [semantics analysis](kit/Android/Semantics) | [![image_classification](docs/images/ImageClassification.gif)]() demo_link: [image classification](inference/examples/image_classification)
+| Face Detection | ASR | Semantics Analysis | Image Classification | Reading Comprehension |
+| :------: | :------: | :------: | :------: | :------: |
+| ![face_detection](docs/images/20_bolt_face_detection.gif) [android](kit/Android/FaceDetection) [ios](kit/iOS/FaceDetection) [exe](inference/examples/ultra_face) | ![asr](docs/images/ChineseSpeechRecognition.gif) [android](kit/Android/ChineseSpeechRecognition) [ios](kit/iOS/ChineseSpeechRecognition) | ![semantics analysis](docs/images/SemanticsAnalysis.gif) [android](kit/Android/Semantics) | ![image_classification](docs/images/ImageClassification.gif) [android](kit/Android/SimpleImageClassification) [ios](kit/iOS/SimpleImageClassification) | ![reading_comprehension](docs/images/ReadingComprehension.gif) [android](kit/Android/ReadingComprehension) |
+
# Verified Networks
---
Bolt has shown its high performance in the inference of common CV and NLP neural networks. Some of the representative networks that we have verified are listed below. You can find detailed benchmark information in [docs/BENCHMARK.md](docs/BENCHMARK.md).
@@ -81,16 +95,32 @@ Two steps to get started with bolt.
BiRealNet,
ReActNet,
Ghostnet,
- SSD, Yolov3, Pointnet, ViT, TNT ...
+ unet,
+ LCNet, Pointnet,
+ hair-segmentation,
+ duc,
+ fcn,
+ retinanet,
+ SSD,
+ Faster-RCNN,
+ Mask-RCNN,
+ Yolov2,
+ Yolov3,
+ Yolov4,
+ Yolov5,
+ ViT, TNT ...
NLP
|
- Bert,
- Albert, Neural Machine Translation, Text To Speech, Automatic Speech Recognition,
- Tdnn ...
+ | Bert, Albert, Tinybert, Neural Machine Translation, Text To Speech(Tacotron,Tacotron2,FastSpeech+hifigan,melgan), Automatic Speech Recognition, DFSMN, Conformer,
+ Tdnn,
+ FRILL,
+ T5,
+ GPT-2,
+ Roberta ...
|
@@ -111,18 +141,23 @@ Two steps to get started with bolt.
- More models than these mentioned above are supported, users are encouraged to further explore.
+ More models than these mentioned above are supported, users are encouraged to further explore.
+
+# On-Device Training
+---
+On-Device Training has come, it's a beta version which supports [Lenet](./training/demos/lenet_demo/), [Mobilenet_v1](./training/demos/mobilenet_v1_demo) and [Resnet18](./training/demos/resnet18_demo) for training on the embedded devices and servers. Want more details of on-device training in bolt? Get started with the official training [tutorial](./training/TUTORIAL.md).
# Documentations
---
Everything you want to know about bolt is recorded in the detailed documentations stored in [docs](docs).
-- [How to install bolt with different compilers](docs/INSTALL.md).
-- [How to use bolt to inference your ML models.](docs/USER_HANDBOOK.md)
-- [How to develop bolt to customize more models.](docs/DEVELOPER.md)
+- [How to install bolt with different compilers?](docs/INSTALL.md).
+- [How to use bolt to inference your ML models?](docs/USER_HANDBOOK.md)
+- [How to develop bolt to customize more models?](docs/DEVELOPER.md)
- [Operators documentation](docs/OPERATORS.md)
- [Benchmark results on some universal models.](docs/BENCHMARK.md)
-- [How to build demo/example with kit.](docs/KIT.md)
+- [How to visualise/protect bolt model?](docs/USER_HANDBOOK.md#model-visualization)
+- [How to build demo/example with kit?](docs/KIT.md)
- [Frequently Asked Questions(FAQ)](docs/FAQ.md)
# Articles
@@ -133,6 +168,7 @@ Everything you want to know about bolt is recorded in the detailed documentation
- [Bolt GPU性能优化,让上帝帮忙掷骰子](https://zhuanlan.zhihu.com/p/336218879)
- [Bolt助力HMS机器翻译,自然语言处理又下一城](https://zhuanlan.zhihu.com/p/337887620)
- [ARM CPU 1-bit推理,走向极致的道路](https://zhuanlan.zhihu.com/p/158161592)
+- [基于深度学习加速库Bolt的声音克隆技术(Voice Cloning)](https://zhuanlan.zhihu.com/p/498919929)
# 教程
---
@@ -141,7 +177,8 @@ Everything you want to know about bolt is recorded in the detailed documentation
- 情感分类: [Android Demo](https://zhuanlan.zhihu.com/p/414971037)
- 中文语音识别: [Android Demo](https://zhuanlan.zhihu.com/p/414978782), [iOS Demo](https://zhuanlan.zhihu.com/p/414981121)
- 人脸检测: [Android Demo](https://zhuanlan.zhihu.com/p/414975102), [iOS Demo](https://zhuanlan.zhihu.com/p/414971375)
-
+- 阅读理解: [Android Demo](https://zhuanlan.zhihu.com/p/498906834)
+
# Acknowledgement
---
Bolt refers to the following projects: [caffe](https://github.com/BVLC/caffe), [onnx](https://github.com/onnx/onnx), [tensorflow](https://github.com/tensorflow/tensorflow), [ncnn](https://github.com/Tencent/ncnn), [mnn](https://github.com/alibaba/MNN), [dabnn](https://github.com/JDAI-CV/dabnn).
diff --git a/SUMMARY.md b/SUMMARY.md
new file mode 100644
index 00000000..7a4216cd
--- /dev/null
+++ b/SUMMARY.md
@@ -0,0 +1,42 @@
+# Summary
+
+* [Introduction](README.md)
+
+
+* [Architecture](docs/ARCHITECTURE.md)
+
+
+* [Operators](docs/OPERATORS.md)
+
+
+* [Install](docs/INSTALL.md)
+
+
+* [Basic Inference Usage](docs/USER_HANDBOOK.md#basic-usage)
+
+
+* [Basic On-device Training Usage](training/TUTORIAL.md)
+
+
+* [Advanced Features](docs/USER_HANDBOOK.md#advanced-features)
+
+
+* [Developer Customization](docs/DEVELOPER.md)
+
+
+* [How to Reduce GPU Initial Time](docs/REDUCE_GPU_PREPARE_TIME.md)
+
+
+* [Kit Example](docs/KIT.md)
+
+
+* [Changelog](docs/CHANGELOG.md)
+
+
+* [FAQ](docs/FAQ.md)
+
+
+* [Feedback](docs/FEEDBACK.md)
+
+
+* [Appendix](docs/IOS_USAGE.md)
diff --git a/book.json b/book.json
new file mode 100644
index 00000000..baf158d1
--- /dev/null
+++ b/book.json
@@ -0,0 +1,22 @@
+{
+ "plugins": [
+ "github",
+ "back-to-top-button",
+ "page-toc-button",
+ "insert-logo"
+ ],
+
+ "pluginsConfig": {
+ "github": {
+ "url": "https://github.com/huawei-noah/bolt"
+ },
+ "page-toc-button": {
+ "maxTocDepth": 1,
+ "minTocSize": 2
+ },
+ "insert-logo":{
+ "url":"../docs/images/LOGO.PNG",
+ "style":"background:none;max-height:100px"
+ }
+ }
+}
diff --git a/common/cmakes/FindSecureC.cmake b/common/cmakes/FindSecureC.cmake
new file mode 100644
index 00000000..72a8ed82
--- /dev/null
+++ b/common/cmakes/FindSecureC.cmake
@@ -0,0 +1,24 @@
+find_path(SecureC_INCLUDE_DIR NAMES securec.h HINTS $ENV{SecureC_ROOT}/include ${SecureC_ROOT}/include)
+
+if (USE_DYNAMIC_LIBRARY)
+ find_library(SecureC_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib)
+ set(SecureC_SHARED_LIBRARY ${SecureC_LIBRARY})
+else (USE_DYNAMIC_LIBRARY)
+ find_library(SecureC_LIBRARY NAMES ${CMAKE_STATIC_LIBRARY_PREFIX}securec${CMAKE_STATIC_LIBRARY_SUFFIX} HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib)
+ find_library(SecureC_SHARED_LIBRARY NAMES securec HINTS $ENV{SecureC_ROOT}/lib ${SecureC_ROOT}/lib)
+endif (USE_DYNAMIC_LIBRARY)
+
+if (SecureC_INCLUDE_DIR AND SecureC_LIBRARY)
+ set(SecureC_FOUND true)
+endif (SecureC_INCLUDE_DIR AND SecureC_LIBRARY)
+
+if (SecureC_FOUND)
+ include_directories(${SecureC_INCLUDE_DIR})
+ message(STATUS "Found securec.h: ${SecureC_INCLUDE_DIR}")
+ message(STATUS "Found securec: ${SecureC_LIBRARY}")
+else (SecureC_FOUND)
+ message(FATAL_ERROR "
+FATAL: can not find securec library in /[include|lib] directory,
+ please set shell environment variable SecureC_ROOT.
+ ")
+endif (SecureC_FOUND)
diff --git a/common/cmakes/bolt.cmake b/common/cmakes/bolt.cmake
index 04308d5b..3f0378d1 100644
--- a/common/cmakes/bolt.cmake
+++ b/common/cmakes/bolt.cmake
@@ -12,6 +12,7 @@ option(USE_CAFFE "set use caffe model as input or not" OFF)
option(USE_ONNX "set use onnx model as input or not" OFF)
option(USE_TFLITE "set use tflite model as input or not" OFF)
option(USE_TENSORFLOW "set use tensorflow model as input or not" OFF)
+option(USE_MINDSPORE "set use mindspore model as input or not" OFF)
# blas_enhance tensor
option(USE_GENERAL "set use CPU serial code or not" OFF)
@@ -26,12 +27,23 @@ option(USE_INT8_WINOGRAD "set use ARM NEON INT8 winograd" ON)
option(USE_OPENMP "set use openmp to run test(tinybert) or not" OFF)
option(USE_LIBRARY_TUNING "set use algorithm tuning or not" OFF)
+option(USE_MEM_CHECK "set to use memory check or not" OFF)
+option(USE_MODEL_PRINT "set to use model print or not" ON)
+option(USE_SECURE_C "set to use Huawei Secure C or not" OFF)
+
+option(USE_TRAINING "set whether to use training or not" OFF)
option(USE_FLOW "set whether to use flow or not" OFF)
option(USE_JNI "set whether to use Java API or not" OFF)
option(BUILD_TEST "set to build unit test or not" OFF)
+include(CheckCXXCompilerFlag)
+
+if (USE_TRAINING)
+ set(ANDROID_TOOLCHAIN_PREFIX "aarch64-linux-android-")
+endif(USE_TRAINING)
+
function (set_policy)
if (POLICY CMP0074)
cmake_policy(SET CMP0074 NEW)
@@ -39,15 +51,19 @@ function (set_policy)
endfunction(set_policy)
macro (set_c_cxx_flags)
- set(COMMON_FLAGS "-W -Wextra -O3 -fPIC")
- if (NOT WIN32)
- set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all")
- endif()
+ set(COMMON_FLAGS "-O3 -fPIC -fPIE")
+ # warning flag can be remove in release version
+ set(COMMON_FLAGS "${COMMON_FLAGS} -W -Wextra")
+ set(COMMON_FLAGS "${COMMON_FLAGS} -fstack-protector-all -fstack-protector-strong")
set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-command-line-argument -Wno-unused-parameter")
set(COMMON_FLAGS "${COMMON_FLAGS} -Wno-unused-result -Wno-deprecated-declarations -Wno-unused-variable")
if (USE_OPENMP)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_OPENMP -fopenmp")
+ CHECK_CXX_COMPILER_FLAG("-static-openmp" COMPILER_SUPPORTS_STATIC_OPENMP)
+ if (COMPILER_SUPPORTS_STATIC_OPENMP)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -static-openmp")
+ endif ()
endif(USE_OPENMP)
if (USE_THREAD_SAFE OR USE_CAFFE OR USE_ONNX OR USE_FLOW)
@@ -99,27 +115,29 @@ macro (set_c_cxx_flags)
if (USE_INT8)
set(COMMON_FLAGS "${COMMON_FLAGS} -mavx512f")
endif (USE_INT8)
- if (USE_AVX512_VNNI)
+ if (USE_AVX512_VNNI)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_AVX512_VNNI")
- endif(USE_AVX512_VNNI)
+ endif(USE_AVX512_VNNI)
endif(USE_X86)
if (USE_FP32)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP32")
endif (USE_FP32)
+ if (USE_FP16)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16")
+ if (USE_F16_MIX_PRECISION)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION")
+ endif (USE_F16_MIX_PRECISION)
+ endif ()
+
if (USE_INT8)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8")
endif (USE_INT8)
if (USE_NEON)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_NEON")
-
if (USE_FP16)
- set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_FP16")
- if (USE_F16_MIX_PRECISION)
- set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_F16_MIX_PRECISION")
- endif (USE_F16_MIX_PRECISION)
if (USE_INT8)
if (USE_INT8_WINOGRAD)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8_WINOGRAD")
@@ -137,9 +155,6 @@ macro (set_c_cxx_flags)
endif ()
endif (USE_INT8)
endif (USE_FP16)
- if (USE_INT8)
- set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_INT8")
- endif ()
endif(USE_NEON)
if (USE_CAFFE)
@@ -154,6 +169,21 @@ macro (set_c_cxx_flags)
if (USE_TENSORFLOW)
set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_TENSORFLOW")
endif()
+ if (USE_MINDSPORE)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MINDSPORE")
+ endif()
+
+ if (USE_MEM_CHECK)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MEM_CHECK")
+ endif()
+
+ if (USE_MODEL_PRINT)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_MODEL_PRINT")
+ endif()
+
+ if (USE_SECURE_C)
+ set(COMMON_FLAGS "${COMMON_FLAGS} -D_USE_SECURE_C")
+ endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${COMMON_FLAGS} -std=gnu99")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${COMMON_FLAGS} -std=c++11")
@@ -168,8 +198,14 @@ endmacro(set_c_cxx_flags)
macro (set_test_c_cxx_flags)
if (NOT USE_DYNAMIC_LIBRARY)
set(COMMON_FLAGS "${COMMON_FLAGS} -static-libstdc++")
- if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}" AND "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
- set(COMMON_FLAGS "${COMMON_FLAGS} -static")
+ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+ set(COMMON_FLAGS "${COMMON_FLAGS} -static-libgcc")
+ if (NOT "${CMAKE_HOST_SYSTEM_PROCESSOR}" STREQUAL "${CMAKE_SYSTEM_PROCESSOR}")
+ set(COMMON_FLAGS "${COMMON_FLAGS} -static")
+ endif()
+ if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
+ set(COMMON_FLAGS "${COMMON_FLAGS} -static")
+ endif()
endif()
endif()
@@ -198,6 +234,7 @@ if(USE_DYNAMIC_LIBRARY)
set(model_tools_onnx_library model_tools_onnx)
set(model_tools_tflite_library model_tools_tflite)
set(model_tools_tensorflow_library model_tools_tensorflow)
+ set(model_tools_mindspore_library model_tools_mindspore)
set(model_tools_library model_tools)
set(engine_library engine)
set(flow_library flow)
@@ -213,6 +250,7 @@ else()
set(model_tools_onnx_library model_tools_onnx_static)
set(model_tools_tflite_library model_tools_tflite_static)
set(model_tools_tensorflow_library model_tools_tensorflow_static)
+ set(model_tools_mindspore_library model_tools_mindspore_static)
set(model_tools_library model_tools_static)
set(engine_library engine_static)
set(flow_library flow_static)
@@ -220,10 +258,16 @@ endif()
macro(include_uni)
include_directories(${BOLT_ROOT}/common/uni/include)
+ if (USE_SECURE_C)
+ include_directories(${SecureC_ROOT}/include)
+ endif ()
endmacro()
macro(link_uni name)
target_link_libraries(${name} ${uni_library})
+ if (USE_SECURE_C)
+ target_link_libraries(${name} ${SecureC_LIBRARY})
+ endif ()
endmacro()
macro(include_model_spec)
@@ -330,7 +374,10 @@ macro(link_model_tools name)
target_link_libraries(${name} ${model_tools_tensorflow_library})
target_link_libraries(${name} ${JSONCPP_LIBRARY})
endif()
- if(USE_CAFFE OR USE_ONNX)
+ if(USE_MINDSPORE)
+ target_link_libraries(${name} ${model_tools_mindspore_library})
+ endif()
+ if(USE_CAFFE OR USE_ONNX OR USE_MINDSPORE)
link_protobuf(${name})
endif()
link_model_spec(${name})
diff --git a/common/cmakes/cpuinfo.cmake b/common/cmakes/cpuinfo.cmake
new file mode 100644
index 00000000..87c94f77
--- /dev/null
+++ b/common/cmakes/cpuinfo.cmake
@@ -0,0 +1,16 @@
+set(CPUINFO "null")
+file(GLOB CPUINFO_FILE /proc/cpuinfo)
+if (CPUINFO_FILE)
+ exec_program(cat ARGS ${CPUINFO_FILE} OUTPUT_VARIABLE CPUINFO)
+else ()
+ message(STATUS "can not find /proc/cpuinfo")
+endif ()
+
+macro(check_cpuinfo feature)
+ string(REGEX REPLACE "^.*(${feature}).*$" "\\1" _FEATURE_THERE "${CPUINFO}")
+ string(COMPARE EQUAL "${feature}" "${_FEATURE_THERE}" cpuinfo_${feature})
+endmacro()
+
+check_cpuinfo(avx2)
+check_cpuinfo(avx512)
+check_cpuinfo(avx512_vnni)
diff --git a/common/gcl/include/gcl_common.h b/common/gcl/include/gcl_common.h
index 0e0e16c2..2836d69c 100644
--- a/common/gcl/include/gcl_common.h
+++ b/common/gcl/include/gcl_common.h
@@ -130,13 +130,14 @@ inline CI8 *map_cl_error_2_string(cl_int err)
}
}
-#define map_cl_error_2_ee(err) \
- { \
- if (err == 0) \
- return SUCCESS; \
- UNI_ERROR_LOG("GCLAPI error in: File: %s Line: %d Func name is: %s GCLERROR = %s\n", \
- __FILE__, __LINE__, __FUNCTION__, map_cl_error_2_string(err)); \
- return GCL_ERROR; \
+#define map_cl_error_2_ee(err) \
+ { \
+ if (err == 0) { \
+ return SUCCESS; \
+ } else { \
+ UNI_ERROR_LOG("GCLAPI error: %s.\n", map_cl_error_2_string(err)); \
+ return GCL_ERROR; \
+ } \
}
inline EE has_dedicated_local(Device device, I32 *b)
@@ -171,6 +172,14 @@ struct GCLKernelInfo {
std::string name;
};
+typedef struct {
+ I32 algorithm;
+ U32 best_h[6];
+ U32 best_c[6];
+ U32 best_k[6];
+} ForwardRunInfoMali;
+typedef ForwardRunInfoMali *ForwardRunInfoMali_t;
+
struct GCLHandle {
Platform *platforms;
U32 numPlatform;
@@ -201,6 +210,8 @@ struct GCLHandle {
std::string deviceName;
std::map kernelMap;
std::map programMap;
+ std::map, ForwardRunInfoMali> runInfoCache;
+ std::map> kernelLSCache;
std::vector *kernelVec;
std::string curOpName;
void *kernel_source;
@@ -221,14 +232,6 @@ struct GCLHandleConfig {
typedef GCLHandleConfig *GCLHandleConfig_t;
-typedef struct {
- I32 algorithm;
- U32 best_h[6];
- U32 best_c[6];
- U32 best_k[6];
-} ForwardRunInfoMali;
-typedef ForwardRunInfoMali *ForwardRunInfoMali_t;
-
typedef struct {
GCLHandle_t handle;
GCLMemDesc_t gclmemFilterDesc;
diff --git a/common/gcl/include/gcl_func.h b/common/gcl/include/gcl_func.h
index 515c1486..93cb7130 100644
--- a/common/gcl/include/gcl_func.h
+++ b/common/gcl/include/gcl_func.h
@@ -559,7 +559,8 @@ inline EE gcl_create_kernel_with_source_map(
option = handle->common_source_opt + " " + option;
}
if (!kernel_source->get_source(sourceName, &source_ptr)) {
- UNI_ERROR_LOG("the %s doesn't exist in sourceMap\n", sourceName);
+ UNI_ERROR_LOG(
+ "the %s doesn't exist in sourceMap to find kernel %s.\n", sourceName, kernelName);
CHECK_STATUS(NULL_POINTER);
}
@@ -878,6 +879,53 @@ inline EE gcl_run_kernel(
return SUCCESS;
}
+inline EE gcl_get_kernel_name(Kernel kernel, I8 *kernelName)
+{
+ char name[256];
+ U32 len;
+ CHECK_STATUS(get_kernel_name(kernel, name, &len));
+ if (len > 256) {
+ UNI_ERROR_LOG("KernelName length %d > 256, please reset name array length\n", len);
+ CHECK_STATUS(NOT_MATCH);
+ } else {
+ UNI_STRCPY(kernelName, name);
+ }
+ return SUCCESS;
+}
+
+inline void gcl_set_kernel_ls_to_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3])
+{
+ std::string name = kernelName;
+ name += "_" + std::to_string(gs[0]);
+ name += "_" + std::to_string(gs[1]);
+ name += "_" + std::to_string(gs[2]);
+ std::vector lsVec = {ls[0], ls[1], ls[2]};
+ if (handle->kernelLSCache.find(name) == handle->kernelLSCache.end()) {
+ handle->kernelLSCache[name] = lsVec;
+ }
+}
+
+inline bool gcl_get_kernel_ls_from_cache(GCLHandle_t handle, CI8 *kernelName, U32 gs[3], U32 ls[3])
+{
+ std::string name = kernelName;
+ name += "_" + std::to_string(gs[0]);
+ name += "_" + std::to_string(gs[1]);
+ name += "_" + std::to_string(gs[2]);
+ if (handle->kernelLSCache.find(name) != handle->kernelLSCache.end()) {
+ for (U32 i = 0; i < 3; i++) {
+ ls[i] = handle->kernelLSCache[name][i];
+ }
+ UNI_DEBUG_LOG("get kernel %s ls from cache success, gs is {%d %d %d}, ls is {%d %d %d}\n",
+ kernelName, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2]);
+ return true;
+ } else {
+ UNI_DEBUG_LOG("get kernel %s ls from cache fail, try to find best ls for kernel, gs is {%d "
+ "%d %d}\n",
+ kernelName, gs[0], gs[1], gs[2]);
+ return false;
+ }
+}
+
inline U32 get_next_ls_size(U32 ls_size)
{
return (ls_size << 1);
@@ -969,16 +1017,20 @@ inline EE gcl_run_kernelVec_select_ls(GCLHandle_t handle, std::vector kerne
for (auto index : kernelIndex) {
auto kernelInfo = (*handle->kernelVec)[index];
bool needSelectLs = false;
+ U32 gs[3] = {0, 0, 0};
for (U32 i = 0; i < kernelInfo.dim; i++) {
if (kernelInfo.ls[i] == 0) {
needSelectLs = true;
- break;
}
+ gs[i] = kernelInfo.gs[i];
}
if (!needSelectLs) {
continue;
}
CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelInfo));
+ char kernelName[256];
+ gcl_get_kernel_name(kernelInfo.kernel, kernelName);
+ gcl_set_kernel_ls_to_cache(handle, kernelName, gs, kernelInfo.ls);
(*handle->kernelVec)[index].gs[0] = kernelInfo.gs[0];
(*handle->kernelVec)[index].gs[1] = kernelInfo.gs[1];
(*handle->kernelVec)[index].gs[2] = kernelInfo.gs[2];
@@ -995,17 +1047,18 @@ inline EE gcl_infer_best_kernelVec_ls_with_map(
{
std::vector kernelIndex;
U32 len = handle->kernelVec->size();
+ bool needSaveKernelThreadInfoToMap = false;
for (U32 i = 0; i < len; i++) {
auto kernelInfo = (*handle->kernelVec)[i];
- U32 gs[3];
- U32 ls[3];
+ U32 gs[3] = {0};
+ U32 ls[3] = {0};
bool findKernelThreadInfo = false;
findKernelThreadInfo = algoMap->getKernelThreadInfoFromMap(kernelInfo.name, gs, ls);
U32 dim = (*handle->kernelVec)[i].dim;
if (findKernelThreadInfo) {
U32 cur_gs[3];
for (U32 j = 0; j < dim; j++) {
- cur_gs[j] = (*handle->kernelVec)[i].gs[j];
+ cur_gs[j] = kernelInfo.gs[j];
if (ls[j] != 0) {
cur_gs[j] = (cur_gs[j] + ls[j] - 1) / ls[j] * ls[j];
}
@@ -1014,16 +1067,29 @@ inline EE gcl_infer_best_kernelVec_ls_with_map(
}
} else {
bool noNeedInferLS = true;
+ needSaveKernelThreadInfoToMap = true;
for (U32 j = 0; j < dim; j++) {
- gs[j] = (*handle->kernelVec)[i].gs[j];
- ls[j] = (*handle->kernelVec)[i].ls[j];
+ gs[j] = kernelInfo.gs[j];
+ ls[j] = kernelInfo.ls[j];
if (ls[j] == 0) {
noNeedInferLS = false;
}
}
+ if (!noNeedInferLS) {
+ char kernelName[256];
+ gcl_get_kernel_name(kernelInfo.kernel, kernelName);
+ if (gcl_get_kernel_ls_from_cache(handle, kernelName, gs, ls)) {
+ for (U32 j = 0; j < dim; j++) {
+ (*handle->kernelVec)[i].ls[j] = ls[j];
+ }
+ noNeedInferLS = true;
+ }
+ }
if (noNeedInferLS) {
for (U32 j = 0; j < dim; j++) {
- (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j];
+ if (ls[j] > 0) {
+ (*handle->kernelVec)[i].gs[j] = (gs[j] + ls[j] - 1) / ls[j] * ls[j];
+ }
}
}
if (!noNeedInferLS) {
@@ -1032,9 +1098,11 @@ inline EE gcl_infer_best_kernelVec_ls_with_map(
}
}
CHECK_STATUS(gcl_run_kernelVec_select_ls(handle, kernelIndex));
- for (U32 i = 0; i < len; i++) {
- auto kernelInfo = (*handle->kernelVec)[i];
- algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls);
+ if (needSaveKernelThreadInfoToMap) {
+ for (U32 i = 0; i < len; i++) {
+ auto kernelInfo = (*handle->kernelVec)[i];
+ algoMap->setKernelThreadInfoToMap(kernelInfo.name, kernelInfo.gs, kernelInfo.ls);
+ }
}
return SUCCESS;
}
@@ -1387,7 +1455,7 @@ inline EE gcl_set_kernelArgs(Kernel kernel, Args... args)
inline std::string gclMemDesc2Str(GCLMemDesc desc)
{
char buff[128];
- snprintf(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt],
+ UNI_SNPRINTF(buff, sizeof(buff), "dt:%s memFormat:%s ", DataTypeName()[desc.dt],
DataFormatName()[desc.memFormat]);
std::string descStr = buff;
descStr += "stride(";
@@ -1414,6 +1482,28 @@ inline EE gcl_get_image_size(GCLMem_t gclMem, U32 *width, U32 *height, U32 *dept
CHECK_STATUS(get_image_size(gclMem->mem, width, height, depth));
return SUCCESS;
}
+
+inline void gcl_set_runInfo_to_cache(
+ GCLHandle_t handle, std::vector flag, ForwardRunInfoMali runInfo)
+{
+ if (handle->runInfoCache.find(flag) == handle->runInfoCache.end()) {
+ handle->runInfoCache[flag] = runInfo;
+ }
+}
+
+inline bool gcl_get_runInfo_from_cache(
+ GCLHandle_t handle, std::vector flag, ForwardRunInfoMali_t runInfo)
+{
+ if (handle->runInfoCache.find(flag) != handle->runInfoCache.end()) {
+ *runInfo = handle->runInfoCache[flag];
+ UNI_DEBUG_LOG("get forward run info from cache success\n");
+ return true;
+ } else {
+ UNI_DEBUG_LOG("get forward run info from cache fail, try to find best forward run info\n");
+ return false;
+ }
+}
+
#ifdef _DEBUG
template
inline EE gcl_print_memory(GCLHandle_t handle, GCLMem_t gclMem, CI8 *gclMemName = NULL)
diff --git a/common/gcl/include/kernel.h b/common/gcl/include/kernel.h
index 5653e1b5..d2147edc 100644
--- a/common/gcl/include/kernel.h
+++ b/common/gcl/include/kernel.h
@@ -49,6 +49,22 @@ inline EE get_kernel_info(Kernel kernel, cl_kernel_info info, void **value, size
map_cl_error_2_ee(ret);
}
+inline EE get_kernel_name(Kernel kernel, char* name, U32 *len)
+{
+ if (NULL == name || NULL == len) {
+ return NULL_POINTER;
+ }
+
+ size_t lenVal;
+ cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, 0, NULL, &lenVal);
+ if (ret != CL_SUCCESS) {
+ map_cl_error_2_ee(ret);
+ }
+ *len = lenVal;
+ ret = clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, lenVal, name, NULL);
+ map_cl_error_2_ee(ret);
+}
+
inline EE get_program_info_from_kernel(Kernel kernel, Program *program)
{
cl_int ret = clGetKernelInfo(kernel, CL_KERNEL_PROGRAM, sizeof(Program), program, NULL);
diff --git a/common/gcl/src/ocl_data_trans.cpp b/common/gcl/src/ocl_data_trans.cpp
index ab0aa94d..0b16d266 100644
--- a/common/gcl/src/ocl_data_trans.cpp
+++ b/common/gcl/src/ocl_data_trans.cpp
@@ -413,7 +413,7 @@ EE ocl_trans_mem(
CHECK_STATUS(NOT_MATCH);
}
CHECK_STATUS(set_padding_opt_mali(
- true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt));
+ true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt));
CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt));
CHECK_STATUS(gcl_set_kernelArgs(kernel, sw_str, sh_str, dw_str, dh_str, 0, 0,
sw_str, sh_str, dw_str, dh_str, pl, pr, pt, pb, gs[0], gs[1], srcMem, dstMem));
@@ -494,7 +494,7 @@ EE ocl_map_mem_write(
CHECK_STATUS(NOT_MATCH);
}
CHECK_STATUS(set_padding_opt_mali(
- true, Pad_Constant, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt));
+ true, PAD_CONSTANT, DT_F16, GCL_MEM_BUF, GCL_MEM_BUF, kernelName, &kernelOpt));
CHECK_STATUS(gcl_get_kernel_from_map(handle, kernelName, &kernel, &kernelOpt));
CHECK_STATUS(gcl_set_kernelArgs(kernel, w, h, w_str, h_str, offset, 0, w, h, w_str,
h_str, pl, pr, pt, pb, gs[0], gs[1], gclMem->mem, gclMem->mem));
diff --git a/common/gcl/tools/gcl_sample/sample.cpp b/common/gcl/tools/gcl_sample/sample.cpp
index b496ad07..8d7821a9 100644
--- a/common/gcl/tools/gcl_sample/sample.cpp
+++ b/common/gcl/tools/gcl_sample/sample.cpp
@@ -10,7 +10,6 @@
// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#ifdef _USE_FP16
#include "gcl.h"
#include "ocl_context.h"
@@ -128,22 +127,22 @@ int main()
oc_str = oc_str / (ot * on);
on_str = owh_str * oc_str;
- // F16* input_val = (F16*)malloc(inputGclDesc.byteSize);
- // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize);
- // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize);
- // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1;
- // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1;
- // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0;
- // U32 size[3] = {1, 1, 1};
- // size[0] = inputGclDesc.byteSize;
- // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE));
- // size[0] = filterGclDesc.byteSize;
- // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE));
- // size[0] = biasGclDesc.num;
- // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE));
+ // F16* input_val = (F16*)malloc(inputGclDesc.byteSize);
+ // F16* filter_val = (F16*)malloc(filterGclDesc.byteSize);
+ // F16* bias_val = (F16*)malloc(biasGclDesc.byteSize);
+ // for (U32 i = 0; i < inputGclDesc.num; i++) input_val[i] = (i % 16) * 0.1;
+ // for (U32 i = 0; i < filterGclDesc.num; i++) filter_val[i] = (i % 16) * 0.1;
+ // for (U32 i = 0; i < biasGclDesc.num * 4; i++) bias_val[i] = 1.0;
+ // U32 size[3] = {1, 1, 1};
+ // size[0] = inputGclDesc.byteSize;
+ // CHECK_STATUS(gcl_trans_memory(handle, input_val, input, size, HOST_TO_DEVICE_BUF, CL_TRUE));
+ // size[0] = filterGclDesc.byteSize;
+ // CHECK_STATUS(gcl_trans_memory(handle, filter_val, flt, size, HOST_TO_DEVICE_BUF, CL_TRUE));
+ // size[0] = biasGclDesc.num;
+ // CHECK_STATUS(gcl_trans_memory(handle, bias_val, bias, size, HOST_TO_DEVICE_IMG, CL_TRUE));
//
- // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input"));
- // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter"));
+ // CHECK_STATUS(gcl_check_buf(handle, input->mem, inputGclDesc.byteSize, false, "input"));
+ // CHECK_STATUS(gcl_check_buf(handle, flt->mem, filterGclDesc.byteSize, false, "filter"));
gcl_finish(handle);
for (U32 item_bn = 2; item_bn <= 4; item_bn++) {
for (U32 item_kn = 1; item_kn <= 2; item_kn = item_kn * 2) {
@@ -160,10 +159,10 @@ int main()
}
Kernel kernel;
- char kernelName[1024];
- sprintf(kernelName, "conv_direct_multi_batch_s1_%d%d%d%d%d", fw, fh, item_w,
- item_kn, item_bn);
- CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel));
+ std::string kernelName = std::string("conv_direct_multi_batch_s1_") +
+ std::to_string(fw) + std::to_string(fh) + std::to_string(item_w) +
+ std::to_string(item_kn) + std::to_string(item_bn);
+ CHECK_STATUS(gcl_create_kernel(handle, kernelName.c_str(), &kernel));
if (oc_str % item_kn != 0) {
continue;
}
@@ -174,7 +173,7 @@ int main()
CHECK_STATUS(gcl_set_kernelArgs(kernel, ih_str, iwh_str, ic_str, ih_off, iw_off,
oh_str, owh_str, oh_off, ow_off, ow, oc, on, sh, in_str, on_str, gs[0], gs[1],
input->mem, flt->mem, bias->mem, output->mem));
- gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName);
+ gcl_set_kernelVec(handle, kernel, dim, gs, ls, kernelName.c_str());
CHECK_STATUS(gcl_run_kernel_select_ls(handle, &kernelVec[0]));
#ifdef _DEBUG
CHECK_STATUS(gcl_run_kernelVec_timing(handle, 0, handle->kernelVec->size()));
@@ -185,12 +184,11 @@ int main()
#else
CHECK_STATUS(gcl_run_kernelVec(handle));
#endif
- // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output"));
- // CHECK_STATUS(gcl_fill_memory_zero(handle, output));
+ // CHECK_STATUS(gcl_check_buf(handle, output->mem, outputGclDesc.byteSize, false, "output"));
+ // CHECK_STATUS(gcl_fill_memory_zero(handle, output));
CHECK_STATUS(gcl_clean_kernelVec(handle));
gcl_finish(handle);
}
}
}
}
-#endif
diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp
index c6f2e89d..469a238f 100644
--- a/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp
+++ b/common/gcl/tools/kernel_lib_compile/kernel_bin/clbinary.cpp
@@ -164,9 +164,9 @@ int main(I32 argc, I8 *argv[])
U32 srcLen = imageLen + half16Len + clcodeLen;
I8 *source = new I8[srcLen];
#ifdef CL_VERSION_1_2
- memcpy(source, imagesource, imageLen);
+ UNI_MEMCPY(source, imagesource, imageLen);
#endif
- memcpy(source + imageLen, half16source, half16Len);
+ UNI_MEMCPY(source + imageLen, half16source, half16Len);
FileStatus = LoadBinFile(FLAGS_inputFilename, source + imageLen + half16Len, clcodeLen);
if (!FileStatus) {
printf("load bin file failed\n");
diff --git a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
index 3b73ffba..cfd4c113 100644
--- a/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
+++ b/common/gcl/tools/kernel_lib_compile/kernel_bin2char/bin2char.cpp
@@ -12,11 +12,9 @@
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include
-#include
#include
#include
#include
-#include
int main(int argc, char *argv[])
{
@@ -55,57 +53,47 @@ int main(int argc, char *argv[])
}
binMapName = argv[3];
} else {
- printf("please input .bin name + binmapname or input .bin name + .cpp name + binmapname\n");
+ printf("[ERROR] please pass xxx.bin name + binmapname or xxx.bin name + xxx.cpp name + "
+ "binmapname.\n");
+ return 1;
}
FILE *fpbin = fopen(binFile.c_str(), "rb");
if (fpbin == NULL) {
- printf("file %s open error\n", binFile.c_str());
+ printf("[ERROR] can not open file %s.\n", binFile.c_str());
return 1;
}
struct stat f_stat;
if (stat(binFile.c_str(), &f_stat) == -1) {
- printf("file %s get size error\n", binFile.c_str());
+ printf("[ERROR] can not get file %s size.\n", binFile.c_str());
fclose(fpbin);
return 1;
}
int filelen = f_stat.st_size;
- std::stringstream templen;
- templen << filelen;
- std::string filelen_st = templen.str();
-
std::string str = "#include \"inline_" + std::string(binMapName) + ".h\"\n\nCU32 " +
- std::string(charName) + "_len = " + filelen_st + ";\nCU8 " + std::string(charName) +
- "[] = {";
-
- unsigned char charRead;
- std::string appendBuf;
-
+ std::string(charName) + "_len = " + std::to_string(filelen) + ";\nCU8 " +
+ std::string(charName) + "[] = {";
+ std::stringstream ss;
for (int i = 0; i < filelen; i++) {
- appendBuf.clear();
+ unsigned char c;
if (i % 20 == 0) {
- appendBuf += "\n";
+ ss << "\n";
}
- if (1 != fread(&charRead, 1, 1, fpbin)) {
- printf("file %s read error\n", binFile.c_str());
+ if (1 != fread(&c, 1, 1, fpbin)) {
+ printf("[ERROR] can not read file %s content.\n", binFile.c_str());
fclose(fpbin);
return 1;
}
- char tempstr[4];
- sprintf(tempstr, "0x%02x", charRead);
- appendBuf += std::string(tempstr);
-
+ ss << "0x" << std::hex << std::setw(2) << std::setfill('0') << (int)c;
if (i == filelen - 1) {
} else if (i % 20 == 19) {
- appendBuf += ",";
+ ss << ",";
} else {
- appendBuf += ", ";
+ ss << ", ";
}
- str += appendBuf;
}
-
- str += "};";
+ str += ss.str() + "};";
std::ofstream file;
file.open(cppFile.c_str());
@@ -113,6 +101,5 @@ int main(int argc, char *argv[])
file.close();
fclose(fpbin);
-
return 0;
}
diff --git a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp
index 2dc1cd0e..2a0d5871 100644
--- a/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp
+++ b/common/gcl/tools/kernel_source_compile/kernel_cl2char/cl2char.cpp
@@ -378,11 +378,7 @@ int main()
if (boltEnv == NULL) {
UNI_ERROR_LOG("BOLT_ROOT env value has not been set successfully\n");
};
- std::string boltPath = boltEnv;
- CI8 lastFlag = boltPath[boltPath.length() - 1];
- if (strcmp(&lastFlag, "/") != 0) {
- boltPath += "/";
- }
+ std::string boltPath = boltEnv + std::string("/");
std::string tensorComputingClPath = "compute/tensor/src/gpu/mali/cl/";
std::string tensorComputingClPathQc = "compute/tensor/src/gpu/mali/cl/qualcomm/";
std::string imageClPath = "compute/image/src/gpu/mali/cl/";
diff --git a/common/memory/include/memory_cpu.hpp b/common/memory/include/memory_cpu.hpp
index 7ac82f70..eb654d49 100644
--- a/common/memory/include/memory_cpu.hpp
+++ b/common/memory/include/memory_cpu.hpp
@@ -18,7 +18,7 @@
inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes)
{
- void *ptr = (void **)operator new(bytes + sizeof(void *) + alignment - 1);
+ void *ptr = (void **)UNI_OPERATOR_NEW(bytes + sizeof(void *) + alignment - 1);
CHECK_REQUIREMENT(ptr != NULL);
void **aligned_ptr =
(void **)(((uintptr_t)(ptr) + sizeof(void *) + alignment - 1) & ~(alignment - 1));
@@ -28,7 +28,7 @@ inline void *CPUMemoryAlignedAlloc(size_t alignment, size_t bytes)
inline void CPUMemoryAlignedfree(void *aligned_ptr)
{
- operator delete(((void **)aligned_ptr)[-1]);
+ UNI_OPERATOR_DELETE(((void **)aligned_ptr)[-1]);
}
class CpuMemory : public Memory {
@@ -39,7 +39,8 @@ class CpuMemory : public Memory {
this->allocated = false;
}
- ~CpuMemory() = default;
+ ~CpuMemory()
+ {}
std::shared_ptr clone(bool allocate) override
{
@@ -71,13 +72,13 @@ class CpuMemory : public Memory {
this->capacitySize = size;
try {
#ifndef _USE_X86
- this->val = std::shared_ptr((U8 *)operator new(size));
+ this->val = std::shared_ptr((U8 *)UNI_OPERATOR_NEW(size), UNI_OPERATOR_DELETE);
#else
this->val = std::shared_ptr(
(U8 *)CPUMemoryAlignedAlloc(64, size), CPUMemoryAlignedfree);
#endif
} catch (const std::bad_alloc &e) {
- UNI_ERROR_LOG("CPU memory alloc %d bytes failed\n", (int)size);
+ UNI_ERROR_LOG("CPU memory alloc %d bytes failed.\n", (int)size);
}
}
this->allocated = true;
@@ -179,7 +180,7 @@ class CpuMemory : public Memory {
std::string string(U32 num, F32 factor) override
{
U32 capacityNum = this->capacitySize / bytesOf(this->desc.dt);
- std::string line = "desc: " + tensorDesc2Str(this->desc) + " data:";
+ std::string line = "desc:" + tensorDesc2Str(this->desc) + " data:";
for (U32 i = 0; i < num && i < capacityNum; i++) {
line = line + std::to_string(this->element(i) / factor) + " ";
}
@@ -187,7 +188,7 @@ class CpuMemory : public Memory {
for (U32 i = 0; i < UNI_MIN(tensorNumElements(this->desc), capacityNum); i++) {
sum += this->element(i) / factor;
}
- line += " sum: " + std::to_string(sum);
+ line += " sum:" + std::to_string(sum);
return line;
}
diff --git a/common/memory/include/memory_ocl.hpp b/common/memory/include/memory_ocl.hpp
index a7194cce..488880f1 100644
--- a/common/memory/include/memory_ocl.hpp
+++ b/common/memory/include/memory_ocl.hpp
@@ -23,7 +23,7 @@ class OclMemory : public Memory {
public:
OclMemory()
{
- memset(&(this->desc), 0, sizeof(GCLMemDesc));
+ UNI_MEMSET(&(this->desc), 0, sizeof(GCLMemDesc));
this->desc.memFormat = DF_NCHW;
this->desc.memType = GCL_MEM_BUF;
this->desc.flags = CL_MEM_READ_WRITE;
@@ -202,14 +202,14 @@ class OclMemory : public Memory {
if (!allocated) {
U8 *tmp = nullptr;
if (size < this->desc.byteSize) {
- U8 *tmp = (U8 *)operator new(this->desc.byteSize);
- memset(tmp, 0, this->desc.byteSize);
- memcpy(tmp, host_ptr, size);
+ tmp = (U8 *)UNI_OPERATOR_NEW(this->desc.byteSize);
+ UNI_MEMSET(tmp, 0, this->desc.byteSize);
+ UNI_MEMCPY(tmp, host_ptr, size);
host_ptr = tmp;
}
this->alloc(host_ptr);
- if (tmp) {
- delete tmp;
+ if (tmp != nullptr) {
+ UNI_OPERATOR_DELETE(tmp);
}
} else {
this->val->desc = this->desc; //TODO DELETE AFTER SPLITE DESC FROM GCLMEM
@@ -345,7 +345,7 @@ class OclMemory : public Memory {
std::string string(U32 num, F32 factor) override
{
- std::string line = "desc: " + gclMemDesc2Str(this->desc) + " data: ";
+ std::string line = "desc:" + gclMemDesc2Str(this->desc) + " data:";
#ifdef _DEBUG
DataType dt = (this->desc.dt == DT_U8) ? DT_F16 : this->desc.dt;
if (dt == DT_U32) {
@@ -374,7 +374,7 @@ class OclMemory : public Memory {
for (U32 i = 0; i < this->length(); i++) {
sum += this->element(i) / factor;
}
- line += " sum: " + std::to_string(sum);
+ line += " sum:" + std::to_string(sum);
}
#endif
return line;
diff --git a/common/memory/include/memory_ocl_img.hpp b/common/memory/include/memory_ocl_img.hpp
index 6865aa43..abd6a7f1 100644
--- a/common/memory/include/memory_ocl_img.hpp
+++ b/common/memory/include/memory_ocl_img.hpp
@@ -127,9 +127,9 @@ class OclMemoryImg : public OclMemory {
U8 *tmp = nullptr;
if (size < this->desc.byteSize) {
if (this->get_mem_type() == OCLMemImg1D) {
- U8 *tmp = (U8 *)operator new(this->bytes());
- memset(tmp, 0, this->bytes());
- memcpy(tmp, host_ptr, size);
+ tmp = (U8 *)UNI_OPERATOR_NEW(this->bytes());
+ UNI_MEMSET(tmp, 0, this->bytes());
+ UNI_MEMCPY(tmp, host_ptr, size);
host_ptr = tmp;
} else {
CHECK_STATUS(NOT_MATCH);
@@ -146,6 +146,9 @@ class OclMemoryImg : public OclMemory {
CHECK_STATUS(NOT_SUPPORTED);
}
}
+ if (tmp != nullptr) {
+ UNI_OPERATOR_DELETE(tmp);
+ }
} else {
if (!allocated) {
this->alloc();
diff --git a/common/memory/include/tensor.hpp b/common/memory/include/tensor.hpp
index abd6d20f..37966c27 100644
--- a/common/memory/include/tensor.hpp
+++ b/common/memory/include/tensor.hpp
@@ -85,6 +85,11 @@ class Tensor {
*(this->scale) = scale;
}
+ void set_scale_ptr(std::shared_ptr scale)
+ {
+ this->scale = scale;
+ }
+
F32 get_scale()
{
return *(this->scale);
@@ -97,7 +102,7 @@ class Tensor {
void copy_from(Tensor *other)
{
- memcpy(this->scale.get(), other->scale.get(), sizeof(F32));
+ UNI_MEMCPY(this->scale.get(), other->scale.get(), sizeof(F32));
this->val->copy_from(other->val.get());
}
diff --git a/common/memory/include/tensor_common.h b/common/memory/include/tensor_common.h
index b0672299..ab912567 100644
--- a/common/memory/include/tensor_common.h
+++ b/common/memory/include/tensor_common.h
@@ -40,7 +40,7 @@ static void transformToNCHWKernel(
case DF_NCHW: {
if (in == on && ic == oc && ih == oh && iw == ow) {
if (output != input) {
- memcpy(output, input, tensorNumBytes(outputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(outputDesc));
}
} else {
U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt);
@@ -49,7 +49,7 @@ static void transformToNCHWKernel(
for (U32 h = 0; h < oh && h < ih; h++) {
U32 srcIndex = ((n * ic + c) * ih + h) * iw;
U32 dstIndex = ((n * oc + c) * oh + h) * ow;
- memcpy(output + dstIndex, input + srcIndex, tileSize);
+ UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize);
}
}
}
@@ -169,7 +169,7 @@ static void transformToNHWCKernel(
case DF_NHWC: {
CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size);
if (input != output) {
- memcpy(output, input, tensorNumBytes(inputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(inputDesc));
}
break;
}
@@ -262,9 +262,9 @@ EE transformNCHWToNCHWC8(
// support channel padding
if (c_i < ic) {
U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
+ UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
} else {
- memset(outputPtr + dstIndex, 0, elementSize);
+ UNI_MEMSET(outputPtr + dstIndex, 0, elementSize);
}
}
}
@@ -299,9 +299,9 @@ EE transformNHWCToNCHWC8(
// support channel padding
if (c_i < ic) {
U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
+ UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
} else {
- memset(outputPtr + dstIndex, 0, elementSize);
+ UNI_MEMSET(outputPtr + dstIndex, 0, elementSize);
}
}
}
@@ -318,7 +318,7 @@ EE transformNCHWC8ToNCHWC8ByGroup(
U32 outputSize = tensorNumElements(outputDesc);
if (group <= 1 || inputSize == outputSize) {
if (input != output) {
- memcpy(output, input, outputSize);
+ UNI_MEMCPY(output, input, outputSize);
}
return SUCCESS;
}
@@ -354,10 +354,10 @@ EE transformNCHWC8ToNCHWC8ByGroup(
U32 srcIndex =
((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) *
elementSize;
- memcpy(
+ UNI_MEMCPY(
(U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize);
} else {
- memset((U8 *)output + dstIndex, 0, elementSize);
+ UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize);
}
}
}
@@ -417,7 +417,7 @@ EE transposeFilter(TensorDesc inputDesc, const void *input, TensorDesc outputDes
for (U32 hw = 0; hw < ih * iw; hw++) {
U32 srcIndex = o * ih * iw * innerSize + hw * innerSize;
U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize);
+ UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, innerSize);
}
}
break;
@@ -475,7 +475,7 @@ EE array_transpose(DataType dt,
inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
}
inputIndex += inputLocalIndex[sizeInnerIndex];
- memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize);
+ UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize);
}
return SUCCESS;
@@ -513,7 +513,7 @@ EE array_transpose_naive(DataType dt,
inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
}
inputIndex += inputLocalIndex[0];
- memcpy(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize);
+ UNI_MEMCPY(outputPtr + i * tileSize, inputPtr + inputIndex * tileSize, tileSize);
}
return SUCCESS;
diff --git a/common/memory/include/tensor_desc.h b/common/memory/include/tensor_desc.h
index f3b92a6e..d2353a02 100644
--- a/common/memory/include/tensor_desc.h
+++ b/common/memory/include/tensor_desc.h
@@ -20,11 +20,14 @@
#include "data_type.h"
#include "error.h"
+#include "secure_c_wrapper.h"
#ifdef _USE_GPU
#define CL_TARGET_OPENCL_VERSION 200
#include "CL/cl.h"
#endif
+#define DIM_LEN 6
+
typedef enum {
DF_NCHW,
DF_NCHWN16, // vectorize for N=16, for filter
@@ -68,7 +71,8 @@ typedef enum {
DF_NKN12K4, // Optimized MMM filter for INT8
DF_NKNx_NKN32, // Optimized LSTM filter
DF_NCHWC16, // vectorize for C=16, for input and output
- DF_NCHWC2NxC4
+ DF_NCHWC2NxC4,
+ DF_SCALAR
} DataFormat;
inline const char *const *DataFormatName()
@@ -79,7 +83,8 @@ inline const char *const *DataFormatName()
"DF_MKT", "DF_NK", "DF_NKN16", "DF_NKN32", "DF_NKN64", "DF_NKN32K4", "DF_NCHWC4",
"DF_NCHWC3", "DF_NHWC", "DF_NCHWN4C4", "DF_NCHWN4", "DF_HWCN", "DF_NCWHN4C4", "DF_NHWCN4",
"DF_CHWNC4", "DF_CHWNC8", "DF_CHWNC16", "DF_CHWC8_NCN8", "DF_RGB", "DF_HWNCN8", "DF_NKN24",
- "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4"};
+ "DF_NKN12", "DF_NKN8", "DF_NKN12K4", "DF_NKNx_NKN32", "DF_NCHWC16", "DF_NCHWC2NxC4",
+ "DF_SCALAR"};
return names;
}
@@ -87,13 +92,13 @@ typedef struct TensorDesc {
DataType dt = DT_U8;
DataFormat df = DF_NCHW;
U32 nDims = 0;
- U32 dims[6] = {0};
+ U32 dims[DIM_LEN] = {0};
} TensorDesc;
inline TensorDesc tensor0d()
{
TensorDesc desc;
- memset(&desc, 0, sizeof(TensorDesc));
+ UNI_MEMSET(&desc, 0, sizeof(TensorDesc));
return desc;
}
@@ -365,20 +370,38 @@ inline U8 tensorIs5d(TensorDesc desc)
return 5 == desc.nDims;
}
+// in order to support shape calculation, there is a reserved buffer in TensorDesc.dims to save.
+inline U8 tensorIsShape(TensorDesc desc)
+{
+ U32 length = tensorNumElements(desc);
+ U8 ret = 0;
+ if (desc.dt == DT_U32 && length > 0 && length + desc.nDims <= DIM_LEN) {
+ ret = 1;
+ }
+ return ret;
+}
+
inline std::string tensorDesc2Str(TensorDesc desc)
{
std::string descStr = "dt:" + std::string(DataTypeName()[desc.dt]) +
" df:" + std::string(DataFormatName()[desc.df]) + " dims:" + std::to_string(desc.nDims);
-
if (desc.nDims > 0) {
descStr += "(";
- }
- for (I32 i = int(desc.nDims) - 1; i >= 0; i--) {
- descStr += std::to_string(desc.dims[i]);
- if (i > 0) {
- descStr += ",";
- } else {
- descStr += ")";
+ for (I32 i = int(desc.nDims) - 1; i > 0; i--) {
+ descStr += std::to_string(desc.dims[i]) + ",";
+ }
+ descStr += std::to_string(desc.dims[0]) + ")";
+ if (tensorIsShape(desc)) {
+ U32 length = tensorNumElements(desc);
+ descStr += " reserve:(";
+ for (U32 i = desc.nDims; i < desc.nDims + length && i < DIM_LEN; i++) {
+ descStr += std::to_string((int)desc.dims[i]);
+ if (i + 1 < desc.nDims + length && i + 1 < DIM_LEN) {
+ descStr += ",";
+ } else {
+ descStr += ")";
+ }
+ }
}
}
@@ -387,15 +410,15 @@ inline std::string tensorDesc2Str(TensorDesc desc)
inline int tensorDescIsValid(TensorDesc desc)
{
- if (desc.dt < 0 || desc.dt >= 10) {
+ if (desc.dt < 0 || desc.dt >= DT_NUM) {
return 0;
}
- if (desc.df < 0 || desc.df >= 30) {
+ if (desc.df < 0 || desc.df >= 50) {
return 0;
}
- if (desc.nDims > 6) {
+ if (desc.nDims > DIM_LEN) {
return 0;
}
@@ -427,6 +450,7 @@ inline DataFormat getTensorDefaultDataFormat(int nDims)
return df;
}
+// return format is [w, h, c, n]
inline std::vector calculateLocalIndex(U32 index, const U32 *dims, U32 nDims)
{
std::vector indexes(nDims);
@@ -441,7 +465,8 @@ inline U32 calculateGlobalIndex(const U32 *indexes, const U32 *dims, U32 nDims)
{
U32 index = 0;
for (int i = ((int)nDims) - 1; i >= 0; i--) {
- index = index * dims[i] + indexes[i];
+ U32 value = indexes[i] >= dims[i] ? 0 : indexes[i];
+ index = index * dims[i] + value;
}
return index;
}
@@ -470,13 +495,13 @@ typedef enum {
} GCLMemType;
struct GCLMemDesc {
- U32 dims[6];
+ U32 dims[DIM_LEN];
U32 nDims;
DataType dt;
DataFormat df;
U32 stride[3];
- U32 offset[6];
+ U32 offset[DIM_LEN];
GCLMemType memType;
DataFormat memFormat;
U32 byteSize;
diff --git a/common/memory/include/tensor_transpose.h b/common/memory/include/tensor_transpose.h
index 5a37ab33..63a097f4 100644
--- a/common/memory/include/tensor_transpose.h
+++ b/common/memory/include/tensor_transpose.h
@@ -16,10 +16,10 @@
#include "tensor_desc.h"
#include "uni.h"
-#include "thread_affinity.h"
+#include "affinity_policy.h"
template
-inline static void transformToNCHWKernel(
+inline static EE transformToNCHWKernel(
TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
{
DataType idt, odt;
@@ -40,24 +40,30 @@ inline static void transformToNCHWKernel(
} else if (tensorIs4d(inputDesc)) {
CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
} else {
- UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format\n", (int)inputDesc.nDims);
- return;
+ UNI_ERROR_LOG("not support transform %d-dim tensor to NCHW format.\n", (int)inputDesc.nDims);
+ return NOT_SUPPORTED;
}
- if (tensorIs3d(outputDesc)) {
+ if (tensorIs2d(outputDesc)) {
+ CHECK_STATUS(tensor2dGet(outputDesc, &odt, &odf, &on, &oc));
+ oh = ow = 1;
+ } else if (tensorIs3d(outputDesc)) {
CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh));
ow = 1;
} else if (tensorIs4d(outputDesc)) {
CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
} else {
- UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor\n", (int)outputDesc.nDims);
- return;
+ UNI_ERROR_LOG("not support transform to %d-dim NCHW tensor.\n", (int)outputDesc.nDims);
+ return NOT_SUPPORTED;
}
CHECK_REQUIREMENT(idt == odt);
+ EE ret = SUCCESS;
switch (idf) {
+ case DF_NORMAL:
+ case DF_MTK:
case DF_NCHW: {
if (in == on && ic == oc && ih == oh && iw == ow) {
if (output != input) {
- memcpy(output, input, tensorNumBytes(outputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(outputDesc));
}
} else {
U32 tileSize = UNI_MIN(iw, ow) * bytesOf(idt);
@@ -66,7 +72,7 @@ inline static void transformToNCHWKernel(
for (U32 h = 0; h < oh && h < ih; h++) {
U32 srcIndex = ((n * ic + c) * ih + h) * iw;
U32 dstIndex = ((n * oc + c) * oh + h) * ow;
- memcpy(output + dstIndex, input + srcIndex, tileSize);
+ UNI_MEMCPY(output + dstIndex, input + srcIndex, tileSize);
}
}
}
@@ -160,49 +166,56 @@ inline static void transformToNCHWKernel(
break;
}
default: {
- UNI_ERROR_LOG("not support transform %s format to NCHW format\n", DataFormatName()[idf]);
+ UNI_ERROR_LOG(
+ "not support transform %s format to NCHW format.\n", DataFormatName()[idf]);
+ ret = NOT_SUPPORTED;
+ break;
}
}
+ return ret;
}
inline EE transformToNCHW(
TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output)
{
if (nullptr == input || nullptr == output) {
- return NULL_POINTER;
+ CHECK_STATUS(NULL_POINTER);
}
+ EE ret = NOT_SUPPORTED;
switch (inputDesc.dt) {
#ifdef _USE_FP32
case DT_F32: {
- transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
+ ret = transformToNCHWKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
break;
}
#endif
#ifdef _USE_FP16
case DT_F16: {
- transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output);
+ ret = transformToNCHWKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output);
break;
}
#endif
#ifdef _USE_INT8
case DT_I8: {
- transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output);
+ ret = transformToNCHWKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output);
break;
}
case DT_U8_Q: {
- transformToNCHWKernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output);
+ ret = transformToNCHWKernel(
+ inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output);
break;
}
#endif
default: {
- return NOT_SUPPORTED;
+ UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]);
+ break;
}
}
- return SUCCESS;
+ return ret;
}
template
-inline static void transformToNHWCKernel(
+inline static EE transformToNHWCKernel(
TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
{
DataType idt, odt;
@@ -219,19 +232,27 @@ inline static void transformToNHWCKernel(
CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
} else {
UNI_ERROR_LOG("not support transform %d-dim tensor to NHWC format\n", (int)inputDesc.nDims);
- return;
+ return NOT_SUPPORTED;
+ }
+ if (tensorIs4d(outputDesc)) {
+ CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+ } else {
+ UNI_ERROR_LOG("not support transform to %d-dim NHWC tensor.\n", (int)outputDesc.nDims);
+ return NOT_SUPPORTED;
}
- CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
U32 size = tensorNumElements(outputDesc);
U32 ihiw = ih * iw;
+ EE ret = SUCCESS;
switch (idf) {
case DF_NHWC: {
CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size);
if (input != output) {
- memcpy(output, input, tensorNumBytes(inputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(inputDesc));
}
break;
}
+ case DF_NORMAL:
+ case DF_MTK:
case DF_NCHW: {
CHECK_REQUIREMENT(tensorNumElements(inputDesc) == size);
for (U32 o = 0, srcIndex = 0; o < in; o++) {
@@ -244,14 +265,16 @@ inline static void transformToNHWCKernel(
}
break;
}
- case DF_NCHWC8: {
- CHECK_REQUIREMENT(ic % 8 == 0);
- ic /= 8;
+ case DF_NCHWC8:
+ case DF_NCHWC16: {
+ U32 align = (idf == DF_NCHWC16) ? 16 : 8;
+ CHECK_REQUIREMENT(ic % align == 0);
+ ic /= align;
for (U32 n = 0, srcIndex = 0; n < in; n++) {
for (U32 c = 0; c < ic; c++) {
for (U32 hw = 0; hw < ihiw; hw++) {
- for (U32 c8 = 0; c8 < 8; c8++, srcIndex++) {
- U32 dstIndex = ((n * ihiw + hw) * ic + c) * 8 + c8;
+ for (U32 cx = 0; cx < align; cx++, srcIndex++) {
+ U32 dstIndex = ((n * ihiw + hw) * ic + c) * align + cx;
output[dstIndex] = input[srcIndex];
}
}
@@ -262,8 +285,11 @@ inline static void transformToNHWCKernel(
default: {
UNI_ERROR_LOG(
"not support transform %s format tensor to NHWC format\n", DataFormatName()[idf]);
+ ret = NOT_SUPPORTED;
+ break;
}
}
+ return ret;
}
inline EE transformToNHWC(
@@ -272,30 +298,32 @@ inline EE transformToNHWC(
if (nullptr == input || nullptr == output) {
return NULL_POINTER;
}
+ EE ret = NOT_SUPPORTED;
switch (inputDesc.dt) {
#ifdef _USE_FP32
case DT_F32: {
- transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
+ ret = transformToNHWCKernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
break;
}
#endif
#ifdef _USE_FP16
case DT_F16: {
- transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output);
+ ret = transformToNHWCKernel(inputDesc, (F16 *)input, outputDesc, (F16 *)output);
break;
}
#endif
#ifdef _USE_INT8
case DT_I8: {
- transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output);
+ ret = transformToNHWCKernel(inputDesc, (INT8 *)input, outputDesc, (INT8 *)output);
break;
}
#endif
default: {
- return NOT_SUPPORTED;
+ UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]);
+ break;
}
}
- return SUCCESS;
+ return ret;
}
inline EE transformNCHWC16ToNCHWC8(
@@ -309,7 +337,7 @@ inline EE transformNCHWC16ToNCHWC8(
U32 in, ic, ih, iw, on, oc, oh, ow;
if (tensorIs2d(inputDesc)) {
if (input != output) {
- memcpy(output, input, tensorNumBytes(inputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(inputDesc));
}
return SUCCESS;
} else if (tensorIs3d(inputDesc)) {
@@ -333,7 +361,7 @@ inline EE transformNCHWC16ToNCHWC8(
U32 srcIndex =
n * ic * ih * iw + c * ih * iw * 8 + (h * iw + w) * 16 + c8 * 8;
U32 dstIndex = n * ic * ih * iw + (c + c8) * ih * iw * 8 + (h * iw + w) * 8;
- memcpy(outputPtr + dstIndex * elementSize,
+ UNI_MEMCPY(outputPtr + dstIndex * elementSize,
inputPtr + srcIndex * elementSize, elementSize * 8);
}
}
@@ -354,7 +382,7 @@ inline EE transformNCHWToNCHWC8(
U32 in, ic, ih, iw, on, oc, oh, ow;
if (tensorIs2d(inputDesc)) {
if (input != output) {
- memcpy(output, input, tensorNumBytes(inputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(inputDesc));
}
return SUCCESS;
} else if (tensorIs3d(inputDesc)) {
@@ -379,9 +407,9 @@ inline EE transformNCHWToNCHWC8(
// support channel padding
if (c_i < ic) {
U32 srcIndex = (((n * ic + c_i) * ih + h) * iw + w) * elementSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
+ UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
} else {
- memset(outputPtr + dstIndex, 0, elementSize);
+ UNI_MEMSET(outputPtr + dstIndex, 0, elementSize);
}
}
}
@@ -416,9 +444,9 @@ inline EE transformNHWCToNCHWC8(
// support channel padding
if (c_i < ic) {
U32 srcIndex = (((n * ih + h) * iw + w) * ic + c_i) * elementSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
+ UNI_MEMCPY(outputPtr + dstIndex, inputPtr + srcIndex, elementSize);
} else {
- memset(outputPtr + dstIndex, 0, elementSize);
+ UNI_MEMSET(outputPtr + dstIndex, 0, elementSize);
}
}
}
@@ -435,7 +463,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup(
U32 outputSize = tensorNumElements(outputDesc);
if (group <= 1 || inputSize == outputSize) {
if (input != output) {
- memcpy(output, input, outputSize);
+ UNI_MEMCPY(output, input, outputSize);
}
return SUCCESS;
}
@@ -471,10 +499,10 @@ inline EE transformNCHWC8ToNCHWC8ByGroup(
U32 srcIndex =
((((n * ict + id_a) * ih + h) * iw + w) * channelAlignSize + id_b) *
elementSize;
- memcpy(
+ UNI_MEMCPY(
(U8 *)output + dstIndex, (const U8 *)input + srcIndex, elementSize);
} else {
- memset((U8 *)output + dstIndex, 0, elementSize);
+ UNI_MEMSET((U8 *)output + dstIndex, 0, elementSize);
}
}
}
@@ -485,7 +513,7 @@ inline EE transformNCHWC8ToNCHWC8ByGroup(
}
template
-inline static void transformToNCHWC16Kernel(
+inline static EE transformToNCHWC16Kernel(
TensorDesc inputDesc, const T *input, TensorDesc outputDesc, T *output)
{
DataType idt, odt;
@@ -508,7 +536,7 @@ inline static void transformToNCHWC16Kernel(
} else {
UNI_ERROR_LOG(
"not support transform %d-dim tensor to NCHWC16 format\n", (int)inputDesc.nDims);
- return;
+ return NOT_SUPPORTED;
}
if (tensorIs3d(outputDesc)) {
CHECK_STATUS(tensor3dGet(outputDesc, &odt, &odf, &on, &oc, &oh));
@@ -517,10 +545,12 @@ inline static void transformToNCHWC16Kernel(
CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
} else {
UNI_ERROR_LOG("not support transform to %d-dim NCHWC16 tensor\n", (int)outputDesc.nDims);
- return;
+ return NOT_SUPPORTED;
}
CHECK_REQUIREMENT(idt == odt);
+ EE ret = SUCCESS;
switch (idf) {
+ case DF_NORMAL:
case DF_MTK:
case DF_NCHW: {
U32 ic16 = ic / 16;
@@ -593,8 +623,11 @@ inline static void transformToNCHWC16Kernel(
default: {
UNI_ERROR_LOG(
"not support transform %s format to NCHWC16 format\n", DataFormatName()[idf]);
+ ret = NOT_SUPPORTED;
+ break;
}
}
+ return ret;
}
inline EE transformToNCHWC16(
@@ -603,37 +636,40 @@ inline EE transformToNCHWC16(
if (nullptr == input || nullptr == output) {
return NULL_POINTER;
}
+ EE ret = NOT_SUPPORTED;
switch (inputDesc.dt) {
#ifdef _USE_FP32
case DT_F32: {
- transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
+ ret = transformToNCHWC16Kernel(inputDesc, (F32 *)input, outputDesc, (F32 *)output);
break;
}
#endif
#ifdef _USE_INT8
case DT_U8_Q: {
- transformToNCHWC16Kernel(inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output);
+ ret = transformToNCHWC16Kernel(
+ inputDesc, (UINT8 *)input, outputDesc, (UINT8 *)output);
break;
}
#endif
default: {
- return NOT_SUPPORTED;
+ UNI_ERROR_LOG("not support transform %s type tensor.\n", DataTypeName()[inputDesc.dt]);
+ break;
}
}
- return SUCCESS;
+ return ret;
}
inline EE transformFormat(
TensorDesc inputDesc, const void *input, TensorDesc outputDesc, void *output)
{
EE ret = NOT_SUPPORTED;
- if (outputDesc.df == DF_NCHW) {
+ if (outputDesc.df == DF_NCHW || outputDesc.df == DF_MTK || outputDesc.df == DF_NORMAL) {
ret = transformToNCHW(inputDesc, input, outputDesc, output);
} else if (outputDesc.df == DF_NCHWC8) {
if (inputDesc.df == DF_NORMAL) {
- memcpy(output, input, tensorNumBytes(inputDesc));
+ UNI_MEMCPY(output, input, tensorNumBytes(inputDesc));
ret = SUCCESS;
- } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK) {
+ } else if (inputDesc.df == DF_NCHW || inputDesc.df == DF_MTK || inputDesc.df == DF_NORMAL) {
ret = transformNCHWToNCHWC8(inputDesc, input, outputDesc, output);
} else if (inputDesc.df == DF_NHWC) {
ret = transformNHWCToNCHWC8(inputDesc, input, outputDesc, output);
@@ -648,6 +684,8 @@ inline EE transformFormat(
}
} else if (outputDesc.df == DF_NCHWC16) {
ret = transformToNCHWC16(inputDesc, input, outputDesc, output);
+ } else if (outputDesc.df == DF_NHWC) {
+ ret = transformToNHWC(inputDesc, input, outputDesc, output);
} else {
UNI_ERROR_LOG("layout transpose can not support transform to %s format.\n",
DataFormatName()[outputDesc.df]);
@@ -664,34 +702,39 @@ inline EE transposeFilter(
DataType idt, odt;
DataFormat idf, odf;
U32 in, ic, ih, iw, on, oc, oh, ow;
- CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
- CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+ if (tensorIs4d(inputDesc) && tensorIs4d(outputDesc)) {
+ CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
+ CHECK_STATUS(tensor4dGet(outputDesc, &odt, &odf, &on, &oc, &oh, &ow));
+ } else {
+ UNI_ERROR_LOG("currently only support to transpose 4-dim filter.\n");
+ return NOT_SUPPORTED;
+ }
CHECK_REQUIREMENT(idf == odf);
- const U8 *inputPtr = (const U8 *)input;
- U8 *outputPtr = (U8 *)output;
-
+ const U8 *src = (const U8 *)input;
+ U8 *dst = (U8 *)output;
+ EE ret = SUCCESS;
switch (idf) {
case DF_NHWCN8: {
CHECK_REQUIREMENT(in % 8 == 0);
in /= 8;
U32 hwMax = ih * iw - 1;
-
U32 innerSize = bytesOf(idt) * ic * 8;
-
for (U32 o = 0; o < in; o++) {
for (U32 hw = 0; hw < ih * iw; hw++) {
U32 srcIndex = o * ih * iw * innerSize + hw * innerSize;
U32 dstIndex = o * ih * iw * innerSize + (hwMax - hw) * innerSize;
- memcpy(outputPtr + dstIndex, inputPtr + srcIndex, innerSize);
+ UNI_MEMCPY(dst + dstIndex, src + srcIndex, innerSize);
}
}
break;
}
default: {
- CHECK_STATUS(NOT_SUPPORTED);
+ UNI_ERROR_LOG(
+ "currently not support to transpose %s format filter.\n", DataFormatName()[idf]);
+ ret = NOT_SUPPORTED;
+ break;
}
}
- return SUCCESS;
+ return ret;
}
-
#endif
diff --git a/common/model_spec/include/model_common.h b/common/model_spec/include/model_common.h
index 264f618f..b5b255e9 100644
--- a/common/model_spec/include/model_common.h
+++ b/common/model_spec/include/model_common.h
@@ -16,10 +16,33 @@
#include
#include "model_spec.h"
+#include "memory_cpu.h"
EE str_copy(I8 *dst, const I8 *src, I32 src_len, I32 dst_len = NAME_LEN);
-void *mt_new_storage(size_t size);
+inline void *mt_malloc(U32 size)
+{
+ return UNI_OPERATOR_NEW(size);
+}
+
+template
+inline void mt_free(T *&p)
+{
+ UNI_OPERATOR_DELETE(p);
+ p = nullptr;
+}
+
+// only WeightSpec's weight and vec varialbles free by using this.
+// because this will use mmap memory.
+template
+inline void mt_free(T *&p, ModelSpec *spec)
+{
+ if (spec == nullptr || spec->mfd == nullptr || (uintptr_t(p) < uintptr_t(spec->mfd->bytes)) ||
+ (uintptr_t(p) >= uintptr_t(spec->mfd->bytes + spec->mfd->fileLength))) {
+ UNI_OPERATOR_DELETE(p);
+ }
+ p = nullptr;
+}
OperatorSpec mt_create_operator(
const char *name, OperatorType type, U32 num_inputs, U32 num_outputs);
@@ -34,4 +57,7 @@ bool isDeprecatedOp(OperatorType opType);
bool isDeprecatedOpWeight(const ModelSpec *spec, int index);
std::string concat_dir_file(std::string dir, std::string file);
+
+void modify_ms_inputs_and_outputs(
+ ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs);
#endif
diff --git a/common/model_spec/include/model_spec.h b/common/model_spec/include/model_spec.h
index 3df6008f..121c79e4 100644
--- a/common/model_spec/include/model_spec.h
+++ b/common/model_spec/include/model_spec.h
@@ -16,7 +16,7 @@
#include "parameter_spec.h"
-static const int sg_boltVersion = 20201120;
+static const int sg_boltVersion = 20220126;
static const int sg_magicNumber = 1141119;
#pragma pack(8)
@@ -87,14 +87,10 @@ typedef struct {
} ModelSpec;
#pragma pack()
-#define outOfFileMapRange(addr, mfd) \
- ((mfd == nullptr) || (uintptr_t(addr) < uintptr_t(mfd->bytes)) || \
- (uintptr_t(addr) >= uintptr_t(mfd->bytes + mfd->fileLength)))
-
-EE mt_create_model(ModelSpec *md);
+EE mt_create_model(ModelSpec *spec);
EE serialize_model_to_file(const ModelSpec *spec, const char *fn);
EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream = false);
-EE mt_destroy_model(ModelSpec *md);
+EE mt_destroy_model(ModelSpec *spec);
#include "model_print.h"
#endif
diff --git a/common/model_spec/src/CMakeLists.txt b/common/model_spec/src/CMakeLists.txt
index e6efbc94..d610b2d6 100644
--- a/common/model_spec/src/CMakeLists.txt
+++ b/common/model_spec/src/CMakeLists.txt
@@ -3,6 +3,9 @@ file(GLOB srcs ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
# shared library
add_library(${PROJECT_NAME} SHARED ${srcs})
target_link_libraries(${PROJECT_NAME} LINK_PUBLIC uni)
+if (USE_SECURE_C)
+ target_link_libraries(${PROJECT_NAME} LINK_PUBLIC ${SecureC_SHARED_LIBRARY})
+endif ()
# static library
add_library(${PROJECT_NAME}_static STATIC ${srcs})
diff --git a/common/model_spec/src/model_common.cpp b/common/model_spec/src/model_common.cpp
index 1dee4ae8..96b24691 100644
--- a/common/model_spec/src/model_common.cpp
+++ b/common/model_spec/src/model_common.cpp
@@ -17,7 +17,7 @@
OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inputs, U32 num_outputs)
{
OperatorSpec newOperator;
- memset(&(newOperator), 0, sizeof(OperatorSpec));
+ UNI_MEMSET(&(newOperator), 0, sizeof(OperatorSpec));
U32 length = UNI_MIN(strlen(name), NAME_LEN - 1);
str_copy(newOperator.name, name, length);
if (length < NAME_LEN) {
@@ -25,14 +25,14 @@ OperatorSpec mt_create_operator(const char *name, OperatorType type, U32 num_inp
}
newOperator.type = type;
newOperator.num_inputs = num_inputs;
- newOperator.input_tensors_name = (I8 **)mt_new_storage(num_inputs * sizeof(I8 *));
+ newOperator.input_tensors_name = (I8 **)mt_malloc(num_inputs * sizeof(I8 *));
for (U32 i = 0; i < num_inputs; i++) {
- newOperator.input_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ newOperator.input_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
}
newOperator.num_outputs = num_outputs;
- newOperator.output_tensors_name = (I8 **)mt_new_storage(num_outputs * sizeof(I8 *));
+ newOperator.output_tensors_name = (I8 **)mt_malloc(num_outputs * sizeof(I8 *));
for (U32 i = 0; i < num_outputs; i++) {
- newOperator.output_tensors_name[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ newOperator.output_tensors_name[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
}
newOperator.tensor_positions = NULL;
newOperator.num_quant_feature = 0;
@@ -46,7 +46,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator)
return NULL_POINTER;
}
OperatorSpec *operatorList =
- (OperatorSpec *)mt_new_storage(sizeof(OperatorSpec) * (ms->num_operator_specs + 1));
+ (OperatorSpec *)mt_malloc(sizeof(OperatorSpec) * (ms->num_operator_specs + 1));
for (int i = 0; i < index; i++) {
operatorList[i] = ms->ops[i];
}
@@ -54,7 +54,7 @@ EE mt_insert_operator(ModelSpec *ms, int index, OperatorSpec newOperator)
for (int i = index; i < ms->num_operator_specs; i++) {
operatorList[i + 1] = ms->ops[i];
}
- delete ms->ops;
+ mt_free(ms->ops);
ms->ops = operatorList;
ms->num_operator_specs++;
return SUCCESS;
@@ -64,7 +64,7 @@ WeightSpec mt_create_weight(
const char *name, DataType dataType, U32 bytesOfWeight, U32 bytesOfVec, U32 numQuantScale)
{
WeightSpec newWeight;
- memset(&(newWeight), 0, sizeof(WeightSpec));
+ UNI_MEMSET(&(newWeight), 0, sizeof(WeightSpec));
U32 length = UNI_MIN(strlen(name), NAME_LEN - 1);
str_copy(newWeight.op_name, name, length);
if (length < NAME_LEN) {
@@ -72,11 +72,11 @@ WeightSpec mt_create_weight(
}
newWeight.mdt = dataType;
newWeight.bytes_of_weight = bytesOfWeight;
- newWeight.weight = (U8 *)mt_new_storage(bytesOfWeight);
+ newWeight.weight = (U8 *)mt_malloc(bytesOfWeight);
newWeight.bytes_of_vec = bytesOfVec;
- newWeight.vec = (U8 *)mt_new_storage(bytesOfVec);
+ newWeight.vec = (U8 *)mt_malloc(bytesOfVec);
newWeight.num_quant_scale = numQuantScale;
- newWeight.weight_scale = (QuantSpec *)mt_new_storage(sizeof(QuantSpec) * numQuantScale);
+ newWeight.weight_scale = (QuantSpec *)mt_malloc(sizeof(QuantSpec) * numQuantScale);
return newWeight;
}
@@ -100,31 +100,18 @@ bool isDeprecatedOpWeight(const ModelSpec *spec, int index)
EE str_copy(I8 *dst, const I8 *src, I32 srcLen, I32 dstLen)
{
- //memset(dst, 0, dstLen);
+ //UNI_MEMSET(dst, 0, dstLen);
//I32 copyLen = UNI_MIN(srcLen, dstLen);
- //memcpy(dst, src, copyLen);
- memset(dst, 0, dstLen);
+ //UNI_MEMCPY(dst, src, copyLen);
+ UNI_MEMSET(dst, 0, dstLen);
I32 copyLen = NAME_LEN - 1;
if (copyLen > srcLen) {
copyLen = srcLen;
}
- memcpy(dst, src, copyLen * sizeof(I8));
+ UNI_MEMCPY(dst, src, copyLen * sizeof(I8));
return SUCCESS;
}
-void *mt_new_storage(size_t size)
-{
- void *ret = nullptr;
- if (size > 0) {
- try {
- ret = operator new(size);
- } catch (const std::bad_alloc &e) {
- UNI_ERROR_LOG("%s alloc %d bytes failed\n", __FUNCTION__, (int)size);
- }
- }
- return ret;
-}
-
std::string concat_dir_file(std::string dir, std::string file)
{
std::string ret;
@@ -143,3 +130,66 @@ std::string concat_dir_file(std::string dir, std::string file)
return ret;
}
+
+std::vector string_parser(std::string s, std::string delimiter)
+{
+ std::vector res;
+ size_t pos = 0;
+ std::string token;
+ while ((pos = s.find(delimiter)) != std::string::npos) {
+ token = s.substr(0, pos);
+ res.push_back(token);
+ s.erase(0, pos + delimiter.length());
+ }
+ res.push_back(s);
+ return res;
+}
+
+void modify_ms_inputs_and_outputs(
+ ModelSpec *ms, std::string modifiedInputs, std::string modifiedOutputs)
+{
+ std::map modifiedStrMap;
+ if (modifiedInputs.length() > 0) {
+ std::vector modified_input_names = string_parser(modifiedInputs, ",");
+ if ((I32)(modified_input_names.size()) != ms->num_inputs) {
+ UNI_ERROR_LOG("input names not match, please check your params meticulously.\n");
+ }
+ for (int i = 0; i < ms->num_inputs; i++) {
+ std::string tmpStr = modified_input_names[i];
+ modifiedStrMap[std::string(ms->input_names[i])] = tmpStr;
+ str_copy(ms->input_names[i], tmpStr.c_str(), tmpStr.length());
+ }
+ }
+ if (modifiedOutputs.length() > 0) {
+ std::vector modified_output_names = string_parser(modifiedOutputs, ",");
+ if ((I32)(modified_output_names.size()) != ms->num_outputs) {
+ UNI_ERROR_LOG("output names not match, please check your params meticulously.\n");
+ }
+ for (int i = 0; i < ms->num_outputs; i++) {
+ std::string tmpStr = modified_output_names[i];
+ modifiedStrMap[std::string(ms->output_names[i])] = tmpStr;
+ str_copy(ms->output_names[i], tmpStr.c_str(), tmpStr.length());
+ }
+ }
+
+ if (modifiedStrMap.size() > 0) {
+ for (I32 i = 0; i < ms->num_operator_specs; i++) {
+ for (U32 j = 0; j < ms->ops[i].num_inputs; j++) {
+ std::string curStr = std::string(ms->ops[i].input_tensors_name[j]);
+ if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) {
+ std::string modifiedStr = modifiedStrMap[curStr];
+ str_copy(ms->ops[i].input_tensors_name[j], modifiedStr.c_str(),
+ modifiedStr.length());
+ }
+ }
+ for (U32 j = 0; j < ms->ops[i].num_outputs; j++) {
+ std::string curStr = std::string(ms->ops[i].output_tensors_name[j]);
+ if (modifiedStrMap.find(curStr) != modifiedStrMap.end()) {
+ std::string modifiedStr = modifiedStrMap[curStr];
+ str_copy(ms->ops[i].output_tensors_name[j], modifiedStr.c_str(),
+ modifiedStr.length());
+ }
+ }
+ }
+ }
+}
diff --git a/common/model_spec/src/model_deserialize.cpp b/common/model_spec/src/model_deserialize.cpp
index 13f8bcbb..8388c5a9 100644
--- a/common/model_spec/src/model_deserialize.cpp
+++ b/common/model_spec/src/model_deserialize.cpp
@@ -128,16 +128,16 @@ EE operator_relationship(ModelSpec *spec)
int opNum = spec->num_operator_specs;
spec->num_op_tensor_entries = opNum;
OperatorSpec *opsPtr2 = spec->ops;
- OperatorRelationshipMapEntry *oprmePtr = (OperatorRelationshipMapEntry *)mt_new_storage(
- sizeof(OperatorRelationshipMapEntry) * opNum);
+ OperatorRelationshipMapEntry *oprmePtr =
+ (OperatorRelationshipMapEntry *)mt_malloc(sizeof(OperatorRelationshipMapEntry) * opNum);
spec->op_relationship_entries = oprmePtr;
for (int j = 0; j < opNum; j++) {
str_copy(oprmePtr[j].op, opsPtr2[j].name, NAME_LEN);
int opInOpNum = opInTensorNew[opsPtr2[j].name].size();
oprmePtr[j].num_inputs = opInOpNum;
- oprmePtr[j].input_op_names = (I8 **)mt_new_storage(opInOpNum * sizeof(I8 *));
+ oprmePtr[j].input_op_names = (I8 **)mt_malloc(opInOpNum * sizeof(I8 *));
for (int k = 0; k < opInOpNum; k++) {
- oprmePtr[j].input_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ oprmePtr[j].input_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
std::string ten_name = opInTensorNew[opsPtr2[j].name][k];
std::string tensor2op = tensorOpMapping[ten_name];
str_copy(oprmePtr[j].input_op_names[k], tensor2op.c_str(), tensor2op.length());
@@ -145,9 +145,9 @@ EE operator_relationship(ModelSpec *spec)
int opOutOpNum = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]].size();
oprmePtr[j].num_outputs = opOutOpNum;
- oprmePtr[j].output_op_names = (I8 **)mt_new_storage(opOutOpNum * sizeof(I8 *));
+ oprmePtr[j].output_op_names = (I8 **)mt_malloc(opOutOpNum * sizeof(I8 *));
for (int k = 0; k < opOutOpNum; k++) {
- oprmePtr[j].output_op_names[k] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ oprmePtr[j].output_op_names[k] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
std::string tensor2op = tensorFlowsToOpSet[opOutTensorNew[opsPtr2[j].name]][k];
str_copy(oprmePtr[j].output_op_names[k], tensor2op.c_str(), tensor2op.length());
}
@@ -163,11 +163,11 @@ void dequantize_int8_weight(int num, F32 scale, INT8 *q, T *d)
int base = -127;
for (int i = 0; i < 255; i++) {
F32 value = factor * base;
-#ifndef __aarch64__
+#ifndef _USE_FP16
if (dt != DT_F16) {
#endif
table[i] = value;
-#ifndef __aarch64__
+#ifndef _USE_FP16
} else {
transformFromFloat(DT_F16, &value, table + i, 1);
}
@@ -184,7 +184,7 @@ template
inline void deserialize_field(const char **buffer, U32 *position, T *element, int length = 1)
{
int size = length * sizeof(T);
- memcpy(element, *buffer, size);
+ UNI_MEMCPY(element, *buffer, size);
*buffer += size;
*position += size;
}
@@ -196,18 +196,20 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos)
deserialize_field(pointer, pos, &spec->version);
if (spec->version != sg_boltVersion) {
- UNI_ERROR_LOG("X2bolt version is [%d], but your model version is : [%d].\n Please update "
- "X2bolt to version[%d].\n",
- sg_boltVersion, spec->version, spec->version);
- CHECK_STATUS(NOT_MATCH);
+ UNI_WARNING_LOG("The read model module version(%d) of the library should match the model "
+ "file of the same version, but your model version is %d. This may "
+ "encounter error.\nPlease use another library or reconverter model.\n",
+ sg_boltVersion, spec->version);
+ }
+ if (spec->version < 20201120) {
+ UNI_ERROR_LOG("This library can not read model with version(%d),\n", spec->version);
return NOT_MATCH;
}
deserialize_field(pointer, pos, &spec->magic_number);
if (spec->magic_number != sg_magicNumber) {
- UNI_ERROR_LOG(
- "magic_number not_match: code %d bolt model %d\n", sg_magicNumber, spec->magic_number);
- CHECK_STATUS(NOT_MATCH);
+ UNI_ERROR_LOG("magic number not match: library is %d, bolt model is %d\n", sg_magicNumber,
+ spec->magic_number);
return NOT_MATCH;
}
@@ -215,18 +217,18 @@ EE deserialize_header(const char *bytes, ModelSpec *spec, U32 *pos)
deserialize_field(pointer, pos, &spec->dt);
deserialize_field(pointer, pos, &spec->num_inputs);
- spec->input_names = (I8 **)mt_new_storage(spec->num_inputs * sizeof(I8 *));
- spec->input_dims = (TensorDesc *)mt_new_storage(spec->num_inputs * sizeof(TensorDesc));
+ spec->input_names = (I8 **)mt_malloc(spec->num_inputs * sizeof(I8 *));
+ spec->input_dims = (TensorDesc *)mt_malloc(spec->num_inputs * sizeof(TensorDesc));
for (int i = 0; i < spec->num_inputs; i++) {
- spec->input_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ spec->input_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
deserialize_field(pointer, pos, spec->input_names[i], NAME_LEN);
}
deserialize_field(pointer, pos, spec->input_dims, spec->num_inputs);
deserialize_field(pointer, pos, &spec->num_outputs);
- spec->output_names = (I8 **)mt_new_storage(spec->num_outputs * NAME_LEN);
+ spec->output_names = (I8 **)mt_malloc(spec->num_outputs * NAME_LEN);
for (int i = 0; i < spec->num_outputs; i++) {
- spec->output_names[i] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ spec->output_names[i] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
deserialize_field(pointer, pos, spec->output_names[i], NAME_LEN);
}
return SUCCESS;
@@ -238,43 +240,57 @@ EE deserialize_operator(const char *bytes, ModelSpec *spec, U32 *pos)
const char **pointer = &operator_pointer;
deserialize_field(pointer, pos, &spec->num_operator_specs);
- spec->ops = (OperatorSpec *)mt_new_storage(spec->num_operator_specs * sizeof(OperatorSpec));
+ spec->ops = (OperatorSpec *)mt_malloc(spec->num_operator_specs * sizeof(OperatorSpec));
OperatorSpec *ptr = spec->ops;
for (int i = 0; i < spec->num_operator_specs; i++) {
deserialize_field(pointer, pos, ptr[i].name, NAME_LEN);
deserialize_field(pointer, pos, &ptr[i].type);
deserialize_field(pointer, pos, &ptr[i].num_inputs);
- ptr[i].input_tensors_name = (I8 **)mt_new_storage(ptr[i].num_inputs * sizeof(I8 *));
+ ptr[i].input_tensors_name = (I8 **)mt_malloc(ptr[i].num_inputs * sizeof(I8 *));
for (U32 j = 0; j < ptr[i].num_inputs; j++) {
- ptr[i].input_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ ptr[i].input_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
deserialize_field(pointer, pos, ptr[i].input_tensors_name[j], NAME_LEN);
}
deserialize_field(pointer, pos, &ptr[i].num_outputs);
- ptr[i].output_tensors_name = (I8 **)mt_new_storage(ptr[i].num_outputs * sizeof(I8 *));
+ ptr[i].output_tensors_name = (I8 **)mt_malloc(ptr[i].num_outputs * sizeof(I8 *));
for (U32 j = 0; j < ptr[i].num_outputs; j++) {
- ptr[i].output_tensors_name[j] = (I8 *)mt_new_storage(NAME_LEN * sizeof(I8));
+ ptr[i].output_tensors_name[j] = (I8 *)mt_malloc(NAME_LEN * sizeof(I8));
deserialize_field(pointer, pos, ptr[i].output_tensors_name[j], NAME_LEN);
}
U32 numTensors = ptr[i].num_inputs + ptr[i].num_outputs;
- ptr[i].tensor_positions = (I32 *)mt_new_storage(numTensors * sizeof(I32));
+ ptr[i].tensor_positions = (I32 *)mt_malloc(numTensors * sizeof(I32));
deserialize_field(pointer, pos, ptr[i].tensor_positions, numTensors);
deserialize_field(pointer, pos, &ptr[i].num_quant_feature);
- ptr[i].feature_scale =
- (QuantSpec *)mt_new_storage(ptr[i].num_quant_feature * sizeof(QuantSpec));
+ ptr[i].feature_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_feature * sizeof(QuantSpec));
for (U32 j = 0; j < ptr[i].num_quant_feature; j++) {
deserialize_field(pointer, pos, &(ptr[i].feature_scale[j].num_scale));
ptr[i].feature_scale[j].scale =
- (F32 *)mt_new_storage(ptr[i].feature_scale[j].num_scale * sizeof(F32));
+ (F32 *)mt_malloc(ptr[i].feature_scale[j].num_scale * sizeof(F32));
deserialize_field(
pointer, pos, ptr[i].feature_scale[j].scale, ptr[i].feature_scale[j].num_scale);
}
- deserialize_field(
- pointer, pos, (U8 *)&(ptr[i].ps), get_operator_parameter_size(ptr[i].type));
+ deserialize_field(pointer, pos, (U8 *)&(ptr[i].ps),
+ get_operator_parameter_size(spec->version, ptr[i].type));
+ if (spec->version == 20201120) {
+ if (ptr[i].type == OT_Conv || ptr[i].type == OT_Deconvolution) {
+ ptr[i].ps.conv_spec.output_pad_t = 0;
+ ptr[i].ps.conv_spec.output_pad_h = 0;
+ ptr[i].ps.conv_spec.output_pad_w = 0;
+ }
+ if (ptr[i].type == OT_LayerNorm) {
+ ptr[i].ps.ln_spec.axis = -1;
+ }
+ }
+ if (spec->version == 20201120 || spec->version == 20211021) {
+ if (ptr[i].type == OT_Transpose) {
+ ptr[i].ps.transpose_spec.df = DF_NCHW;
+ }
+ }
}
return SUCCESS;
}
@@ -285,7 +301,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
const char **pointer = &weight_pointer;
deserialize_field(pointer, pos, &spec->num_weight_specs);
- spec->ws = (WeightSpec *)mt_new_storage(spec->num_weight_specs * sizeof(WeightSpec));
+ spec->ws = (WeightSpec *)mt_malloc(spec->num_weight_specs * sizeof(WeightSpec));
WeightSpec *ptr = spec->ws;
for (int i = 0; i < spec->num_weight_specs; i++) {
U32 length = 0, count = 0;
@@ -296,17 +312,19 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
bool quantFP16 = false;
bool quantInt8 = false;
- if (DT_F16 == ptr[i].mdt && DT_F32 == spec->dt) {
- ptr[i].mdt = DT_F32;
- quantFP16 = true;
- } else if (DT_I8 == ptr[i].mdt && DT_I8 != spec->dt) {
- if (spec->dt == DT_F16_8Q) {
- ptr[i].mdt = DT_F16;
- } else if (spec->dt == DT_F32_8Q) {
- ptr[i].mdt = DT_F32;
- } else {
- ptr[i].mdt = spec->dt;
+ if (DT_F32 == spec->dt) {
+ if (ptr[i].mdt == DT_F16) {
+ quantFP16 = true;
+ }
+ if (ptr[i].mdt == DT_I8) {
+ quantInt8 = true;
}
+ ptr[i].mdt = DT_F32;
+ } else if (DT_F16_8Q == ptr[i].mdt) {
+ ptr[i].mdt = DT_F16;
+ quantInt8 = true;
+ } else if (DT_F32_8Q == ptr[i].mdt) {
+ ptr[i].mdt = DT_F32;
quantInt8 = true;
}
@@ -338,12 +356,11 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
}
deserialize_field(pointer, pos, &ptr[i].num_quant_scale);
- ptr[i].weight_scale =
- (QuantSpec *)mt_new_storage(ptr[i].num_quant_scale * sizeof(QuantSpec));
+ ptr[i].weight_scale = (QuantSpec *)mt_malloc(ptr[i].num_quant_scale * sizeof(QuantSpec));
for (U32 j = 0; j < ptr[i].num_quant_scale; j++) {
deserialize_field(pointer, pos, &(ptr[i].weight_scale[j].num_scale));
ptr[i].weight_scale[j].scale =
- (F32 *)mt_new_storage(ptr[i].weight_scale[j].num_scale * sizeof(F32));
+ (F32 *)mt_malloc(ptr[i].weight_scale[j].num_scale * sizeof(F32));
deserialize_field(
pointer, pos, ptr[i].weight_scale[j].scale, ptr[i].weight_scale[j].num_scale);
}
@@ -351,21 +368,21 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
CHECK_REQUIREMENT(length == count);
if (quantFP16) {
- ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight);
- ptr[i].vec = (U8 *)mt_new_storage(ptr[i].bytes_of_vec);
+ ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight);
+ ptr[i].vec = (U8 *)mt_malloc(ptr[i].bytes_of_vec);
transformToFloat(DT_F16, serialWeight, (F32 *)ptr[i].weight, ptr[i].bytes_of_weight / 4);
transformToFloat(DT_F16, serialBias, (F32 *)ptr[i].vec, ptr[i].bytes_of_vec / 4);
} else {
if (quantInt8) {
CHECK_REQUIREMENT(
1 == ptr[i].num_quant_scale && 1 == ptr[i].weight_scale[0].num_scale);
- ptr[i].weight = (U8 *)mt_new_storage(ptr[i].bytes_of_weight);
+ ptr[i].weight = (U8 *)mt_malloc(ptr[i].bytes_of_weight);
F32 scale = ptr[i].weight_scale[0].scale[0];
if (DT_F32 == ptr[i].mdt) {
dequantize_int8_weight(ptr[i].bytes_of_weight / 4, scale,
(INT8 *)serialWeight, (F32 *)ptr[i].weight);
} else if (DT_F16 == ptr[i].mdt) {
-#ifdef __aarch64__
+#ifdef _USE_FP16
dequantize_int8_weight(ptr[i].bytes_of_weight / 2, scale,
(INT8 *)serialWeight, (F16 *)ptr[i].weight);
#else
@@ -375,7 +392,7 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
} else {
UNI_ERROR_LOG(
"Can not support convert INT8 data to %s.\n", DataTypeName()[ptr[i].mdt]);
- exit(1);
+ return NOT_SUPPORTED;
}
} else {
ptr[i].weight = serialWeight;
@@ -389,28 +406,36 @@ EE deserialize_weight(const char *bytes, ModelSpec *spec, U32 *pos)
EE deserialize_model(const char *bytes, ModelSpec *spec)
{
U32 pos = 0;
- CHECK_STATUS(deserialize_header(bytes, spec, &pos));
- CHECK_STATUS(deserialize_operator(bytes, spec, &pos));
- CHECK_STATUS(deserialize_weight(bytes, spec, &pos));
- CHECK_STATUS(operator_relationship(spec));
+ EE ret = deserialize_header(bytes, spec, &pos);
+ if (ret == SUCCESS) {
+ ret = deserialize_operator(bytes, spec, &pos);
+ }
+ if (ret == SUCCESS) {
+ ret = deserialize_weight(bytes, spec, &pos);
+ }
+ if (ret == SUCCESS) {
+ ret = operator_relationship(spec);
+ }
if (spec->mfd->useFileStream) {
spec->mfd->fileLength = pos;
}
- return SUCCESS;
+ return ret;
}
EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStream)
{
UNI_DEBUG_LOG("Read bolt model from %s...\n", (useFileStream ? "file stream" : fn));
+ EE ret = NOT_SUPPORTED;
UNI_PROFILE(
{
char *bytes = nullptr;
int fd = -1;
size_t fileLength;
- spec->mfd = (ModelFileDescriptor *)mt_new_storage(sizeof(ModelFileDescriptor));
+ spec->mfd = (ModelFileDescriptor *)mt_malloc(sizeof(ModelFileDescriptor));
spec->mfd->useFileStream = useFileStream;
if (useFileStream) {
bytes = (char *)fn;
+ ret = SUCCESS;
} else {
#ifdef _WIN32
FILE *file = fopen(fn, "rb");
@@ -423,7 +448,7 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre
fileLength = ftell(file);
rewind(file);
- bytes = (char *)malloc(sizeof(char) * fileLength);
+ bytes = (char *)UNI_MALLOC(sizeof(char) * fileLength);
if (bytes == NULL) {
UNI_ERROR_LOG("Memory allocated for model failed.\n");
}
@@ -459,9 +484,9 @@ EE deserialize_model_from_file(const char *fn, ModelSpec *spec, bool useFileStre
}
spec->mfd->bytes = bytes;
- CHECK_STATUS(deserialize_model(bytes, spec));
+ ret = deserialize_model(bytes, spec);
},
std::string("deserialize_model_from_file"), std::string("prepare"));
UNI_DEBUG_LOG("Read bolt model end.\n");
- return SUCCESS;
+ return ret;
}
diff --git a/common/model_spec/src/model_print.cpp b/common/model_spec/src/model_print.cpp
index f526b79e..5a3f5654 100644
--- a/common/model_spec/src/model_print.cpp
+++ b/common/model_spec/src/model_print.cpp
@@ -16,6 +16,7 @@
void print_header(const ModelSpec ms)
{
+#ifdef _USE_MODEL_PRINT
printf("[Model] %s\n [DataType] %s\n [Inputs] %d\n", ms.model_name, DataTypeName()[ms.dt],
ms.num_inputs);
if (ms.num_inputs > 0) {
@@ -32,10 +33,12 @@ void print_header(const ModelSpec ms)
for (int i = 0; i < ms.num_outputs; i++) {
printf(" %2d %s\n", i, ms.output_names[i]);
}
+#endif
}
void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecatedOp)
{
+#ifdef _USE_MODEL_PRINT
int number = ms.num_operator_specs;
printf(" [Operators] %d\n", number);
if (number > 0) {
@@ -72,10 +75,12 @@ void print_operator_tensor_relationship(const ModelSpec ms, bool deleteDeprecate
}
printf("\n");
}
+#endif
}
void print_weights(const ModelSpec ms)
{
+#ifdef _USE_MODEL_PRINT
std::map vec_data_type;
for (int i = 0; i < ms.num_operator_specs; i++) {
switch (ms.ops[i].type) {
@@ -129,10 +134,12 @@ void print_weights(const ModelSpec ms)
}
printf("\n");
}
+#endif
}
void print_relationship(const ModelSpec ms)
{
+#ifdef _USE_MODEL_PRINT
int number = ms.num_op_tensor_entries;
printf(" [Relationships] %d\n", number);
if (number > 0) {
@@ -149,6 +156,7 @@ void print_relationship(const ModelSpec ms)
}
printf("\n");
}
+#endif
}
void print_ms(const ModelSpec ms)
diff --git a/common/model_spec/src/model_serialize.cpp b/common/model_spec/src/model_serialize.cpp
index 617bc183..136a34ce 100644
--- a/common/model_spec/src/model_serialize.cpp
+++ b/common/model_spec/src/model_serialize.cpp
@@ -18,14 +18,14 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp)
U32 bufSize = sizeof(I32) * 2 + sizeof(I8) * NAME_LEN + sizeof(DataType) + sizeof(I32) +
sizeof(I8) * NAME_LEN * spec->num_inputs + sizeof(TensorDesc) * spec->num_inputs +
sizeof(I32) + sizeof(I8) * NAME_LEN * spec->num_outputs;
- I8 *data = (I8 *)mt_new_storage(bufSize);
+ I8 *data = (I8 *)mt_malloc(bufSize);
I32 *pointer4version = (I32 *)data;
- memcpy(pointer4version, &spec->version, sizeof(I32));
+ UNI_MEMCPY(pointer4version, &spec->version, sizeof(I32));
pointer4version += 1;
I32 *pointer4magicNumber = (I32 *)pointer4version;
- memcpy(pointer4magicNumber, &spec->magic_number, sizeof(I32));
+ UNI_MEMCPY(pointer4magicNumber, &spec->magic_number, sizeof(I32));
pointer4magicNumber += 1;
I8 *pointer4modelName = (I8 *)pointer4magicNumber;
@@ -47,7 +47,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp)
}
TensorDesc *pointer4TensorDesc = (TensorDesc *)pointer4InputNames;
- memcpy(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs);
+ UNI_MEMCPY(pointer4TensorDesc, spec->input_dims, sizeof(TensorDesc) * spec->num_inputs);
pointer4TensorDesc += spec->num_inputs;
I32 *pointer4numOutputs = (I32 *)pointer4TensorDesc;
@@ -63,7 +63,7 @@ EE serialize_header(const ModelSpec *spec, std::string *tmp)
tmp->clear();
CHECK_REQUIREMENT((U32)(pointer4outputNames - data) == bufSize);
tmp->assign(data, data + bufSize);
- delete data;
+ mt_free(data);
return SUCCESS;
}
@@ -72,7 +72,8 @@ U32 operator_memory_size(OperatorSpec *ops)
// sizeof(U32) * 4 : type + num_inputs + num_output + num_quant_feature
U32 allocatedBufferSize = sizeof(I8) * NAME_LEN + sizeof(U32) * 4 +
ops->num_inputs * NAME_LEN * sizeof(I8) + ops->num_outputs * NAME_LEN * sizeof(I8) +
- (ops->num_inputs + ops->num_outputs) * sizeof(I32) + get_operator_parameter_size(ops->type);
+ (ops->num_inputs + ops->num_outputs) * sizeof(I32) +
+ get_operator_parameter_size(sg_boltVersion, ops->type);
for (U32 i = 0; i < ops->num_quant_feature; i++) {
allocatedBufferSize += sizeof(int); // num_scale
@@ -95,7 +96,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp)
opsTmp++;
}
- char *data = (char *)mt_new_storage(bufSize);
+ char *data = (char *)mt_malloc(bufSize);
I32 *pointer4numOperatorSpecs = (I32 *)data;
*pointer4numOperatorSpecs = spec->num_operator_specs - removeOpNum; // attention
@@ -139,7 +140,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp)
I32 *pointer4tensorPos = (I32 *)pointer4opsOutputTensorsName;
U32 numTensors = opsPointer[i].num_inputs + opsPointer[i].num_outputs;
if (nullptr != opsPointer[i].tensor_positions) {
- memcpy(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32));
+ UNI_MEMCPY(pointer4tensorPos, opsPointer[i].tensor_positions, numTensors * sizeof(I32));
} else {
for (U32 j = 0; j < numTensors; j++) {
pointer4tensorPos[j] = -1;
@@ -156,13 +157,13 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp)
*pointer4quant = opsPointer[i].feature_scale[j].num_scale;
int num = *pointer4quant;
pointer4quant++;
- memcpy(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32));
+ UNI_MEMCPY(pointer4quant, opsPointer[i].feature_scale[j].scale, num * sizeof(F32));
pointer4quant += num;
}
char *pointer4parameterSpecs = (char *)pointer4quant;
- int operatorParameterSize = get_operator_parameter_size(opsPointer[i].type);
- memcpy(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize);
+ int operatorParameterSize = get_operator_parameter_size(sg_boltVersion, opsPointer[i].type);
+ UNI_MEMCPY(pointer4parameterSpecs, &(opsPointer[i].ps), operatorParameterSize);
pointer4parameterSpecs += operatorParameterSize;
pointer4opsName = (I8 *)pointer4parameterSpecs;
}
@@ -170,7 +171,7 @@ EE serialize_operators(const ModelSpec *spec, std::string *tmp)
tmp->clear();
CHECK_REQUIREMENT((U32)(pointer4opsName - data) == bufSize);
tmp->assign(data, data + bufSize);
- delete data;
+ mt_free(data);
return SUCCESS;
}
@@ -194,7 +195,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp)
weightCount++;
}
- char *data = (char *)mt_new_storage(bufSize);
+ char *data = (char *)mt_malloc(bufSize);
I32 *pointer4numWeightSpecs = (I32 *)data;
*pointer4numWeightSpecs = weightCount;
@@ -225,7 +226,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp)
pointer4wsBytesOfWeight++;
U8 *pointer4wsWeight = (U8 *)pointer4wsBytesOfWeight;
- memcpy(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight);
+ UNI_MEMCPY(pointer4wsWeight, wsPointer[i].weight, wsPointer[i].bytes_of_weight);
pointer4wsWeight += wsPointer[i].bytes_of_weight;
U32 *pointer4wsBytesOfVec = (U32 *)pointer4wsWeight;
@@ -233,7 +234,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp)
pointer4wsBytesOfVec++;
U8 *pointer4wsVec = (U8 *)pointer4wsBytesOfVec;
- memcpy(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec);
+ UNI_MEMCPY(pointer4wsVec, wsPointer[i].vec, wsPointer[i].bytes_of_vec);
pointer4wsVec += wsPointer[i].bytes_of_vec;
U32 *pointer4numquant = (U32 *)pointer4wsVec;
@@ -245,7 +246,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp)
*pointer4quant = wsPointer[i].weight_scale[j].num_scale;
int num = *pointer4quant;
pointer4quant++;
- memcpy(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32));
+ UNI_MEMCPY(pointer4quant, wsPointer[i].weight_scale[j].scale, num * sizeof(F32));
pointer4quant += num;
}
@@ -255,7 +256,7 @@ EE serialize_weights(const ModelSpec *spec, std::string *tmp)
tmp->clear();
CHECK_REQUIREMENT((U32)(pointer4wsOpName - data) == bufSize);
tmp->assign(data, data + bufSize);
- delete data;
+ mt_free(data);
return SUCCESS;
}
@@ -299,8 +300,10 @@ EE serialize_model_to_file(const ModelSpec *spec, const char *fn)
{
UNI_DEBUG_LOG("Write bolt model to %s...\n", fn);
std::string bytes = "";
- CHECK_STATUS(serialize_model(spec, &bytes));
- CHECK_STATUS(write_to_file(&bytes, fn));
+ EE ret = serialize_model(spec, &bytes);
+ if (ret == SUCCESS) {
+ ret = write_to_file(&bytes, fn);
+ }
UNI_DEBUG_LOG("Write bolt model end.\n");
return SUCCESS;
}
diff --git a/common/model_spec/src/model_spec.cpp b/common/model_spec/src/model_spec.cpp
index 6de15409..0876089c 100644
--- a/common/model_spec/src/model_spec.cpp
+++ b/common/model_spec/src/model_spec.cpp
@@ -15,7 +15,7 @@
#include
#endif
-#include "model_spec.h"
+#include "model_common.h"
EE mt_create_model(ModelSpec *ms)
{
@@ -49,29 +49,22 @@ EE mt_destroy_model(ModelSpec *ms)
if (nullptr != ms->input_names) {
for (int i = 0; i < ms->num_inputs; i++) {
- if (nullptr != ms->input_names[i]) {
- delete ms->input_names[i];
- }
- ms->input_names[i] = nullptr;
+ mt_free(ms->input_names[i]);
}
- delete ms->input_names;
- ms->input_names = nullptr;
+ ms->num_inputs = 0;
+ mt_free(ms->input_names);
}
if (nullptr != ms->input_dims) {
- delete ms->input_dims;
- ms->input_dims = nullptr;
+ mt_free(ms->input_dims);
}
if (nullptr != ms->output_names) {
for (int i = 0; i < ms->num_outputs; i++) {
- if (nullptr != ms->output_names[i]) {
- delete ms->output_names[i];
- }
- ms->output_names[i] = nullptr;
+ mt_free(ms->output_names[i]);
}
- delete ms->output_names;
- ms->output_names = nullptr;
+ ms->num_outputs = 0;
+ mt_free(ms->output_names);
}
if (nullptr != ms->ops) {
@@ -79,92 +72,79 @@ EE mt_destroy_model(ModelSpec *ms)
for (int i = 0; i < op_num; i++) {
if (nullptr != ms->ops[i].input_tensors_name) {
for (U32 j = 0; j < ms->ops[i].num_inputs; j++) {
- if (nullptr != ms->ops[i].input_tensors_name[j]) {
- delete ms->ops[i].input_tensors_name[j];
- }
- ms->ops[i].input_tensors_name[j] = nullptr;
+ mt_free(ms->ops[i].input_tensors_name[j]);
}
- delete ms->ops[i].input_tensors_name;
- ms->ops[i].input_tensors_name = nullptr;
+ ms->ops[i].num_inputs = 0;
+ mt_free(ms->ops[i].input_tensors_name);
}
if (nullptr != ms->ops[i].output_tensors_name) {
for (U32 j = 0; j < ms->ops[i].num_outputs; j++) {
- if (nullptr != ms->ops[i].output_tensors_name[j]) {
- delete ms->ops[i].output_tensors_name[j];
- }
- ms->ops[i].output_tensors_name[j] = nullptr;
+ mt_free(ms->ops[i].output_tensors_name[j]);
}
- delete ms->ops[i].output_tensors_name;
- ms->ops[i].output_tensors_name = nullptr;
- }
-
- if (nullptr != ms->ops[i].tensor_positions) {
- delete ms->ops[i].tensor_positions;
+ ms->ops[i].num_outputs = 0;
+ mt_free(ms->ops[i].output_tensors_name);
}
+ mt_free(ms->ops[i].tensor_positions);
if (0 != ms->ops[i].num_quant_feature && nullptr != ms->ops[i].feature_scale) {
for (U32 j = 0; j < ms->ops[i].num_quant_feature; j++) {
if (0 != ms->ops[i].feature_scale[j].num_scale) {
- if (nullptr != ms->ops[i].feature_scale[j].scale) {
- delete ms->ops[i].feature_scale[j].scale;
- }
+ ms->ops[i].feature_scale[j].num_scale = 0;
+ mt_free(ms->ops[i].feature_scale[j].scale);
}
}
- delete ms->ops[i].feature_scale;
+ ms->ops[i].num_quant_feature = 0;
+ mt_free(ms->ops[i].feature_scale);
}
}
- delete ms->ops;
- ms->ops = nullptr;
+ ms->num_operator_specs = 0;
+ mt_free(ms->ops);
}
if (nullptr != ms->ws) {
- int weightOpNum = ms->num_weight_specs;
- for (int i = 0; i < weightOpNum; i++) {
- if (nullptr != ms->ws[i].weight && outOfFileMapRange(ms->ws[i].weight, ms->mfd)) {
- delete ms->ws[i].weight;
- }
- ms->ws[i].weight = nullptr;
- if (nullptr != ms->ws[i].vec && outOfFileMapRange(ms->ws[i].vec, ms->mfd)) {
- delete ms->ws[i].vec;
+ for (int i = 0; i < ms->num_weight_specs; i++) {
+ ms->ws[i].bytes_of_weight = 0;
+ mt_free(ms->ws[i].weight, ms);
+ ms->ws[i].bytes_of_vec = 0;
+ mt_free(ms->ws[i].vec, ms);
+ for (U32 j = 0; j < ms->ws[i].num_quant_scale; j++) {
+ if (0 != ms->ws[i].weight_scale[j].num_scale) {
+ ms->ws[i].weight_scale[j].num_scale = 0;
+ mt_free(ms->ws[i].weight_scale[j].scale);
+ }
}
- ms->ws[i].vec = nullptr;
+ ms->ws[i].num_quant_scale = 0;
+ mt_free(ms->ws[i].weight_scale);
}
- delete ms->ws;
- ms->ws = nullptr;
+ ms->num_weight_specs = 0;
+ mt_free(ms->ws);
}
if (nullptr != ms->op_relationship_entries) {
- int numOpRelationPair = ms->num_op_tensor_entries;
- for (int i = 0; i < numOpRelationPair; i++) {
+ for (int i = 0; i < ms->num_op_tensor_entries; i++) {
if (nullptr != ms->op_relationship_entries[i].input_op_names) {
for (U32 j = 0; j < ms->op_relationship_entries[i].num_inputs; j++) {
- if (nullptr != ms->op_relationship_entries[i].input_op_names[j]) {
- delete ms->op_relationship_entries[i].input_op_names[j];
- }
- ms->op_relationship_entries[i].input_op_names[j] = nullptr;
+ mt_free(ms->op_relationship_entries[i].input_op_names[j]);
}
- delete ms->op_relationship_entries[i].input_op_names;
- ms->op_relationship_entries[i].input_op_names = nullptr;
+ ms->op_relationship_entries[i].num_inputs = 0;
+ mt_free(ms->op_relationship_entries[i].input_op_names);
}
if (nullptr != ms->op_relationship_entries[i].output_op_names) {
for (U32 j = 0; j < ms->op_relationship_entries[i].num_outputs; j++) {
- if (nullptr != ms->op_relationship_entries[i].output_op_names[j]) {
- delete ms->op_relationship_entries[i].output_op_names[j];
- }
- ms->op_relationship_entries[i].output_op_names[j] = nullptr;
+ mt_free(ms->op_relationship_entries[i].output_op_names[j]);
}
- delete ms->op_relationship_entries[i].output_op_names;
- ms->op_relationship_entries[i].output_op_names = nullptr;
+ ms->op_relationship_entries[i].num_outputs = 0;
+ mt_free(ms->op_relationship_entries[i].output_op_names);
}
}
- delete ms->op_relationship_entries;
- ms->op_relationship_entries = nullptr;
+ ms->num_op_tensor_entries = 0;
+ mt_free(ms->op_relationship_entries);
}
if (ms->mfd != nullptr && !ms->mfd->useFileStream && ms->mfd->bytes != nullptr) {
#ifdef _WIN32
// use fread to read model file
- free(ms->mfd->bytes);
+ UNI_FREE(ms->mfd->bytes);
#else
// use mmap to read model file
munmap(ms->mfd->bytes, ms->mfd->fileLength);
@@ -173,9 +153,6 @@ EE mt_destroy_model(ModelSpec *ms)
}
#endif
}
-
- delete ms->mfd;
- ms->mfd = nullptr;
-
+ mt_free(ms->mfd);
return SUCCESS;
}
diff --git a/common/uni/include/affinity_policy.h b/common/uni/include/affinity_policy.h
new file mode 100644
index 00000000..b0f9b85f
--- /dev/null
+++ b/common/uni/include/affinity_policy.h
@@ -0,0 +1,94 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_AFFINITY_POLICY
+#define _H_AFFINITY_POLICY
+
+#include "sys.h"
+#ifdef _USE_OPENMP
+#include
+#define OMP_MAX_NUM_THREADS \
+ (getenv("OMP_NUM_THREADS") == NULL ? omp_get_num_procs() : atoi(getenv("OMP_NUM_THREADS")))
+#else
+#define OMP_MAX_NUM_THREADS 1
+#endif
+extern int OMP_NUM_THREADS;
+const int CPU_MAX_NUMBER = 128;
+
+typedef enum {
+ AFFINITY_CPU = 0,
+ AFFINITY_CPU_LOW_POWER = 1,
+ AFFINITY_CPU_HIGH_PERFORMANCE = 2,
+ AFFINITY_GPU = 3
+} AffinityPolicy;
+
+typedef struct CpuStat {
+ unsigned long idle;
+ unsigned long total;
+} CpuStat;
+
+typedef struct DeviceInfo {
+ int cpuNum;
+ Arch archs[CPU_MAX_NUMBER];
+ long freqs[CPU_MAX_NUMBER];
+ float occupys[CPU_MAX_NUMBER];
+ int cpuids[CPU_MAX_NUMBER];
+ CpuStat cpuStats[CPU_MAX_NUMBER];
+
+ float maxOccupy;
+ AffinityPolicy affinityPolicy;
+ Arch schedule;
+} DeviceInfo;
+
+inline const char *const *AffinityPolicyNames()
+{
+ static const char *const names[] = {
+ "CPU_AFFINITY", "CPU_AFFINITY_LOW_POWER", "CPU_AFFINITY_HIGH_PERFORMANCE", "GPU"};
+ return names;
+}
+
+inline const AffinityPolicy *AffinityPolicies()
+{
+ static const AffinityPolicy policies[] = {
+ AFFINITY_CPU, AFFINITY_CPU_LOW_POWER, AFFINITY_CPU_HIGH_PERFORMANCE, AFFINITY_GPU};
+ return policies;
+}
+
+inline AffinityPolicy thread_affinity_get_policy_by_name(const char *name)
+{
+ for (int i = 0; i < 4; i++) {
+ const char *target = AffinityPolicyNames()[i];
+ if (strcmp(target, name) == 0) {
+ return AffinityPolicies()[i];
+ }
+ }
+ return AFFINITY_CPU_HIGH_PERFORMANCE;
+}
+
+inline void set_cpu_num_threads(int threadNum)
+{
+#ifndef _USE_OPENMP
+ if (threadNum > 1) {
+ UNI_WARNING_LOG("this library not support multi-threads parallel, please rebuild with "
+ "--openmp option.\n");
+ }
+#endif
+ if (threadNum < 0) {
+ threadNum = 1;
+ }
+ if (threadNum > OMP_MAX_NUM_THREADS) {
+ threadNum = OMP_MAX_NUM_THREADS;
+ }
+ OMP_NUM_THREADS = threadNum;
+}
+#endif
diff --git a/common/uni/include/algorithm_map.h b/common/uni/include/algorithm_map.h
index 22c315e9..5adecb42 100644
--- a/common/uni/include/algorithm_map.h
+++ b/common/uni/include/algorithm_map.h
@@ -58,9 +58,7 @@ class AlgorithmMap {
if (i == 96) {
continue;
}
- char j[8];
- sprintf(j, "%c", i);
- charSet.insert(j);
+ charSet.insert(std::string(1, i));
}
std::string name = modelName;
diff --git a/common/uni/include/arm_neon_expand.h b/common/uni/include/arm_neon_expand.h
index 83580162..0ff739c1 100644
--- a/common/uni/include/arm_neon_expand.h
+++ b/common/uni/include/arm_neon_expand.h
@@ -344,7 +344,7 @@ inline void vst1q_lane_f16_builtin(__fp16 *address, float16x8_t vec, const int l
#endif
#ifdef _USE_INT8
-#ifdef __aarch64__
+#ifdef _USE_FP16
inline int32x4_t vdotq_laneq_s32_builtin(int32x4_t c, int8x16_t a, int8x16_t b, const int laneId)
{
int32x4_t ret;
diff --git a/common/uni/include/array_transpose.h b/common/uni/include/array_transpose.h
index 52380110..579ef341 100644
--- a/common/uni/include/array_transpose.h
+++ b/common/uni/include/array_transpose.h
@@ -14,7 +14,8 @@
#ifndef _H_ARRAY_TRANSPOSE
#define _H_ARRAY_TRANSPOSE
-#include "string.h"
+#include "secure_c_wrapper.h"
+#include "affinity_policy.h"
template
static inline void inner_transpose_template(unsigned int tileSize,
@@ -26,25 +27,33 @@ static inline void inner_transpose_template(unsigned int tileSize,
int inputDimsNum,
int outputDimsNum,
unsigned int outputSize,
- int sizeInnerIndex,
- unsigned int *inputLocalIndex)
+ int sizeInnerIndex)
{
- for (unsigned int i = 0; i < outputSize; i++) {
- unsigned int outputIndex = i;
- for (int j = sizeInnerIndex; j < outputDimsNum; j++) {
- unsigned int value = outputIndex % outputDims[j];
- outputIndex /= outputDims[j];
- inputLocalIndex[inputDimsNum - 1 - transposeDims[outputDimsNum - 1 - j]] = value;
- }
- unsigned int inputIndex = 0;
- for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) {
- inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
- }
- inputIndex += inputLocalIndex[sizeInnerIndex];
- if (branch == 0) {
- *(output + i) = *(input + inputIndex);
- } else {
- memcpy(output + i * tileSize, input + inputIndex * tileSize, tileSize);
+#ifdef _USE_OPENMP
+#pragma omp parallel num_threads(OMP_NUM_THREADS)
+#endif
+ {
+ std::vector inputLocalIndex(inputDimsNum);
+#ifdef _USE_OPENMP
+#pragma omp for
+#endif
+ for (unsigned int i = 0; i < outputSize; i++) {
+ unsigned int outputIndex = i;
+ for (int j = sizeInnerIndex; j < outputDimsNum; j++) {
+ unsigned int value = outputIndex % outputDims[j];
+ outputIndex /= outputDims[j];
+ inputLocalIndex[inputDimsNum - 1 - transposeDims[outputDimsNum - 1 - j]] = value;
+ }
+ unsigned int inputIndex = 0;
+ for (int j = inputDimsNum - 1; j > sizeInnerIndex; j--) {
+ inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
+ }
+ inputIndex += inputLocalIndex[sizeInnerIndex];
+ if (branch == 0) {
+ *(output + i) = *(input + inputIndex);
+ } else {
+ UNI_MEMCPY(output + i * tileSize, input + inputIndex * tileSize, tileSize);
+ }
}
}
}
@@ -58,15 +67,6 @@ inline void array_transpose(unsigned int elementSize,
int inputDimsNum,
int outputDimsNum)
{
- unsigned int inputSize = 1, outputSize = 1;
- for (int i = 0; i < inputDimsNum; i++) {
- inputSize *= inputDims[i];
- }
- for (int i = 0; i < outputDimsNum; i++) {
- outputSize *= outputDims[i];
- }
- CHECK_REQUIREMENT(inputSize == outputSize);
-
unsigned int sizeInner = 1;
int sizeInnerIndex = 0;
for (int i = outputDimsNum - 1; i >= 0; i--) {
@@ -77,23 +77,55 @@ inline void array_transpose(unsigned int elementSize,
break;
}
}
+ int tileSize = elementSize * sizeInner;
+ int in = inputDims[inputDimsNum - 1], ihiw = 0, ic = 0;
+ if (outputDimsNum - sizeInnerIndex == 3 && transposeDims[0] == 0 && transposeDims[1] == 2 &&
+ transposeDims[2] == 1) {
+ ic = inputDims[inputDimsNum - 2];
+ ihiw = inputDims[inputDimsNum - 3];
+ }
+ if (outputDimsNum - sizeInnerIndex == 4 && transposeDims[0] == 0 && transposeDims[1] == 2 &&
+ transposeDims[2] == 3 && transposeDims[3] == 1) {
+ ic = inputDims[inputDimsNum - 2];
+ ihiw = inputDims[inputDimsNum - 3] * inputDims[inputDimsNum - 4];
+ }
+ if (ic > 0 && ihiw > 0 && input != output) {
+#ifdef _USE_OPENMP
+#pragma omp parallel for num_threads(OMP_NUM_THREADS)
+#endif
+ for (int o = 0; o < in * ihiw; o++) {
+ int n = o / ihiw;
+ int hw = o % ihiw;
+ U8 *dst = (U8 *)output + o * ic * tileSize;
+ for (int c = 0; c < ic; c++, dst += tileSize) {
+ const U8 *src = (const U8 *)input + ((n * ic + c) * ihiw + hw) * tileSize;
+ UNI_MEMCPY(dst, src, tileSize);
+ }
+ }
+ return;
+ }
+
+ unsigned int inputSize = 1, outputSize = 1;
+ for (int i = 0; i < inputDimsNum; i++) {
+ inputSize *= inputDims[i];
+ }
+ for (int i = 0; i < outputDimsNum; i++) {
+ outputSize *= outputDims[i];
+ }
+ CHECK_REQUIREMENT(inputSize == outputSize);
outputSize = outputSize / sizeInner;
- std::vector inputLocalIndex(inputDimsNum, 0);
const char *inputPtr = (const char *)input;
char *outputPtr = (char *)output;
if (sizeInner == 1 && elementSize == 4) {
inner_transpose_template<0, int>(elementSize, inputDims, (const int *)input, outputDims,
- (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex,
- inputLocalIndex.data());
+ (int *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex);
} else if (sizeInner == 1 && elementSize == 2) {
inner_transpose_template<0, short>(elementSize, inputDims, (const short *)input, outputDims,
- (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex,
- inputLocalIndex.data());
+ (short *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex);
} else {
- inner_transpose_template<1, char>(sizeInner * elementSize, inputDims, (const char *)input,
- outputDims, (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize,
- sizeInnerIndex, inputLocalIndex.data());
+ inner_transpose_template<1, char>(tileSize, inputDims, (const char *)input, outputDims,
+ (char *)output, transposeDims, inputDimsNum, outputDimsNum, outputSize, sizeInnerIndex);
}
}
@@ -113,22 +145,31 @@ inline void array_transpose_naive(unsigned int elementSize,
inputSize *= inputDims[i];
outputSize *= outputDims[i];
}
- std::vector inputLocalIndex(dimsNum);
const char *inputPtr = (const char *)input;
char *outputPtr = (char *)output;
- for (unsigned int i = 0; i < outputSize; i++) {
- unsigned int outputIndex = i;
- for (int j = 0; j < dimsNum; j++) {
- unsigned int value = outputIndex % outputDims[j];
- outputIndex /= outputDims[j];
- inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value;
- }
- unsigned int inputIndex = 0;
- for (int j = dimsNum - 1; j > 0; j--) {
- inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
+#ifdef _USE_OPENMP
+#pragma omp parallel num_threads(OMP_NUM_THREADS)
+#endif
+ {
+ std::vector inputLocalIndex(dimsNum);
+#ifdef _USE_OPENMP
+#pragma omp for
+#endif
+ for (unsigned int i = 0; i < outputSize; i++) {
+ unsigned int outputIndex = i;
+ for (int j = 0; j < dimsNum; j++) {
+ unsigned int value = outputIndex % outputDims[j];
+ outputIndex /= outputDims[j];
+ inputLocalIndex[dimsNum - 1 - transposeDims[dimsNum - 1 - j]] = value;
+ }
+ unsigned int inputIndex = 0;
+ for (int j = dimsNum - 1; j > 0; j--) {
+ inputIndex = (inputIndex + inputLocalIndex[j]) * inputDims[j - 1];
+ }
+ inputIndex += inputLocalIndex[0];
+ UNI_MEMCPY(
+ outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize);
}
- inputIndex += inputLocalIndex[0];
- memcpy(outputPtr + i * elementSize, inputPtr + inputIndex * elementSize, elementSize);
}
}
#endif
diff --git a/common/uni/include/data_type.h b/common/uni/include/data_type.h
index 58dbb121..9c152678 100644
--- a/common/uni/include/data_type.h
+++ b/common/uni/include/data_type.h
@@ -15,9 +15,9 @@
#define _H_DATA_TYPE
#include
-#include
#include
-#ifdef __aarch64__
+#include
+#ifdef _USE_FP16
#include
typedef __fp16 F16;
#endif
@@ -25,8 +25,9 @@ typedef __fp16 F16;
#include
#include
#define FTZ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-typedef float F16;
#endif
+#define _USE_ULTRA_OPTIMIZATION
+#include "secure_c_wrapper.h"
typedef int8_t INT8;
typedef uint8_t UINT8;
@@ -56,26 +57,41 @@ typedef enum {
DT_BIN11 = 8,
DT_F32_8Q = 9,
DT_U8_Q = 10,
- DT_NUM = 11
+ DT_I64 = 11,
+ DT_U64 = 12,
+ DT_F64 = 13,
+ DT_NUM = 14
} DataType;
inline const char *const *DataTypeName()
{
static const char *const names[] = {"DT_U8", "DT_I8", "DT_U32", "DT_I32", "DT_F16", "DT_F16_8Q",
- "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_NUM"};
+ "DT_F32", "DT_BIN01", "DT_BIN11", "DT_F32_8Q", "DT_U8_Q", "DT_I64", "DT_U64", "DT_F64",
+ "DT_NUM"};
return names;
}
inline U32 bytesOf(DataType dt)
{
// Please divide number of elements by 8 first in the case of binary data types
- U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 4, 1};
- return dt < DT_NUM ? bytes[dt] : 0;
+ U32 bytes[] = {1, 1, 4, 4, 2, 2, 4, 1, 1, 4, 1, 8, 8, 8};
+ U32 ret;
+ if (dt < DT_NUM) {
+ ret = bytes[dt];
+ } else {
+ ret = 0;
+ printf("[ERROR] try to get unknown type:%s bytes.\n", DataTypeName()[dt]);
+ exit(1);
+ }
+ return ret;
}
#ifdef _USE_FP16
inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int num)
{
+ if (num <= 0) {
+ return;
+ }
if (num % 8 != 0) {
printf("[ERROR] can not support to transformFromHalf for array(length(%d) mod 8 != 0).\n",
num);
@@ -110,6 +126,9 @@ inline void transformFromHalf(DataType dataType, const F16 *src, void *dst, int
inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int num)
{
+ if (num <= 0) {
+ return;
+ }
if (num % 8 != 0) {
printf(
"[ERROR] can not support to transformToHalf for array(length(%d) mod 8 != 0).\n", num);
@@ -148,12 +167,81 @@ inline void transformToHalf(DataType dataType, const void *src, F16 *dst, int nu
}
#endif
+inline void transformToInt(DataType dataType, const void *src, int *dst, int num)
+{
+ if (num <= 0) {
+ return;
+ }
+ switch (dataType) {
+ case DT_I64: {
+ I64 value;
+ const U8 *ptr = (const U8 *)src;
+ for (int i = 0; i < num; i++) {
+ UNI_MEMCPY(&value, ptr, sizeof(I64));
+ ptr += sizeof(I64);
+ value = value > INT_MAX ? INT_MAX : value;
+ dst[i] = value < INT_MIN ? INT_MIN : value;
+ }
+ break;
+ }
+ case DT_U32:
+ case DT_I32: {
+ UNI_MEMCPY(dst, src, sizeof(int) * num);
+ break;
+ }
+ default: {
+ printf("[ERROR] can not transform %s to int.\n", DataTypeName()[dataType]);
+ exit(1);
+ }
+ }
+}
+
+inline unsigned short float32ToFloat16(float value)
+{
+ const U32 *word = (const U32 *)(&value);
+ unsigned short sign = (word[0] & 0x80000000) >> 31;
+ unsigned short exponent = (word[0] & 0x7F800000) >> 23;
+ unsigned int significand = word[0] & 0x7FFFFF;
+
+ unsigned short u;
+ if (exponent == 0) {
+ u = (sign << 15) | (0x00 << 10) | 0x00;
+ } else if (exponent == 0xFF) {
+ u = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00);
+ } else {
+ short newexp = exponent + (-127 + 15);
+ if (newexp >= 31) {
+ u = (sign << 15) | (0x1F << 10) | 0x00;
+ } else if (newexp <= 0) {
+ if (newexp >= -10) {
+ unsigned short sig = (significand | 0x800000) >> (14 - newexp);
+ u = (sign << 15) | (0x00 << 10) | sig;
+ } else {
+ u = (sign << 15) | (0x00 << 10) | 0x00;
+ }
+ } else {
+ u = (sign << 15) | (newexp << 10) | (significand >> 13);
+ }
+ }
+ return u;
+}
+
inline void transformFromFloat(
DataType dataType, const float *src, void *dst, int num, float scale = 1)
{
+ if (num <= 0) {
+ return;
+ }
switch (dataType) {
case DT_F32: {
- memcpy(dst, src, sizeof(float) * num);
+ UNI_MEMCPY(dst, src, sizeof(float) * num);
+ break;
+ }
+ case DT_I64: {
+ I64 *ptr = (I64 *)dst;
+ for (int i = 0; i < num; i++) {
+ ptr[i] = src[i];
+ }
break;
}
case DT_U32: {
@@ -172,41 +260,16 @@ inline void transformFromFloat(
}
case DT_F16_8Q:
case DT_F16: {
-#ifdef __aarch64__
+#ifdef _USE_FP16
F16 *ptr = (F16 *)dst;
#else
- const U32 *word = (const U32 *)src;
unsigned short *q = (unsigned short *)dst;
#endif
for (int i = 0; i < num; i++) {
-#ifdef __aarch64__
+#ifdef _USE_FP16
ptr[i] = src[i];
#else
- unsigned short sign = (word[i] & 0x80000000) >> 31;
- unsigned short exponent = (word[i] & 0x7F800000) >> 23;
- unsigned int significand = word[i] & 0x7FFFFF;
-
- unsigned short u;
- if (exponent == 0) {
- u = (sign << 15) | (0x00 << 10) | 0x00;
- } else if (exponent == 0xFF) {
- u = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00);
- } else {
- short newexp = exponent + (-127 + 15);
- if (newexp >= 31) {
- u = (sign << 15) | (0x1F << 10) | 0x00;
- } else if (newexp <= 0) {
- if (newexp >= -10) {
- unsigned short sig = (significand | 0x800000) >> (14 - newexp);
- u = (sign << 15) | (0x00 << 10) | sig;
- } else {
- u = (sign << 15) | (0x00 << 10) | 0x00;
- }
- } else {
- u = (sign << 15) | (newexp << 10) | (significand >> 13);
- }
- }
- q[i] = u;
+ q[i] = float32ToFloat16(src[i]);
#endif
}
break;
@@ -235,10 +298,20 @@ inline void transformFromFloat(
inline void transformToFloat(
DataType dataType, const void *src, float *dst, int num, float scale = 1)
{
+ if (num <= 0) {
+ return;
+ }
switch (dataType) {
case DT_F32_8Q:
case DT_F32: {
- memcpy(dst, src, sizeof(float) * num);
+ UNI_MEMCPY(dst, src, sizeof(float) * num);
+ break;
+ }
+ case DT_I64: {
+ const I64 *ptr = (const I64 *)src;
+ for (int i = 0; i < num; i++) {
+ dst[i] = ptr[i];
+ }
break;
}
case DT_U32: {
@@ -257,14 +330,14 @@ inline void transformToFloat(
}
case DT_F16_8Q:
case DT_F16: {
-#ifdef __aarch64__
+#ifdef _USE_FP16
const F16 *ptr = (const F16 *)src;
#else
const unsigned short *q = (const unsigned short *)src;
U32 *word = (U32 *)dst;
#endif
for (int i = 0; i < num; i++) {
-#ifdef __aarch64__
+#ifdef _USE_FP16
dst[i] = ptr[i];
#else
unsigned short value = q[i];
@@ -350,13 +423,19 @@ inline void transformToFloat(
inline void UNI_INIT(U32 num, DataType dt, F32 val, void *dst)
{
+ if (num <= 0) {
+ return;
+ }
+ if (val == 0) {
+ UNI_MEMSET(dst, 0, bytesOf(dt) * num);
+ return;
+ }
switch (dt) {
case DT_F16: {
- unsigned int short mem;
- transformFromFloat(DT_F16, &val, &mem, 1);
- U8 *arr = (U8 *)dst;
+ unsigned short mem = float32ToFloat16(val);
+ unsigned short *arr = (unsigned short *)dst;
for (U32 i = 0; i < num; i++) {
- memcpy(arr + i * bytesOf(DT_F16), &mem, bytesOf(DT_F16));
+ arr[i] = mem;
}
break;
}
diff --git a/common/uni/include/error.h b/common/uni/include/error.h
index 00af4c70..e35e2227 100644
--- a/common/uni/include/error.h
+++ b/common/uni/include/error.h
@@ -19,7 +19,12 @@
#include
#ifdef _WIN32
+#ifdef _USE_JNI
#define UNI_THREADID int tid = 0;
+#else
+#include <windows.h>
+#define UNI_THREADID int tid = GetThreadId(GetCurrentThread());
+#endif
#elif defined(__GLIBC__) || defined(__linux__)
#include
#define UNI_THREADID pid_t tid = syscall(SYS_gettid);
@@ -80,23 +85,23 @@ extern "C" {
}) \
}
-#define UNI_WARNING_LOG(...) \
- { \
- UNI_THREADID \
- UNI_THREAD_SAFE({ \
- UNI_LOGD("[WARNING] thread %d file %s line %d ", tid, __FILE__, __LINE__); \
- UNI_LOGD(__VA_ARGS__); \
- }) \
+#define UNI_WARNING_LOG(...) \
+ { \
+ UNI_THREADID \
+ UNI_THREAD_SAFE({ \
+ UNI_LOGD("[WARNING] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \
+ UNI_LOGD(__VA_ARGS__); \
+ }) \
}
-#define UNI_ERROR_LOG(...) \
- { \
- UNI_THREADID \
- UNI_THREAD_SAFE({ \
- UNI_LOGD("[ERROR] thread %d file %s line %d ", tid, __FILE__, __LINE__); \
- UNI_LOGD(__VA_ARGS__); \
- }) \
- UNI_EXIT; \
+#define UNI_ERROR_LOG(...) \
+ { \
+ UNI_THREADID \
+ UNI_THREAD_SAFE({ \
+ UNI_LOGD("[ERROR] thread %d file %s line %d: ", tid, __FILE__, __LINE__); \
+ UNI_LOGD(__VA_ARGS__); \
+ }) \
+ UNI_EXIT; \
}
#ifdef _DEBUG
diff --git a/common/uni/include/memory_cpu.h b/common/uni/include/memory_cpu.h
new file mode 100644
index 00000000..271f9156
--- /dev/null
+++ b/common/uni/include/memory_cpu.h
@@ -0,0 +1,123 @@
+// Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"),
+// to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#ifndef _H_UNI_MEMORY_CPU
+#define _H_UNI_MEMORY_CPU
+
+#include "secure_c_wrapper.h"
+#include
+#ifdef _USE_MEM_CHECK
+#include